diff --git a/CMakeLists.txt b/CMakeLists.txt index 74c4fc052..1e65ce917 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,16 +44,35 @@ if (${CMAKE_BUILD_TYPE_CASE_INSENSITIVE} STREQUAL "debug") set(USE_SECURITY_FLAGS FALSE) endif() +if ("${DRM_INCLUDE_DIR}" STREQUAL "") + find_path(DRM_INCLUDE_DIR "i915_drm.h" PATH_SUFFIXES "drm" "libdrm") +else() + message(STATUS "DRM_INCLUDE_DIR is set by user: ${DRM_INCLUDE_DIR}") +endif() + +if (DRM_INCLUDE_DIR) + if (EXISTS ${DRM_INCLUDE_DIR}/i915_drm.h) + set(ENABLE_DRM TRUE) + else() + set(ENABLE_DRM FALSE) + endif() +else() + set(ENABLE_DRM FALSE) +endif() + option(BUILD_EXAMPLES "Build examples" TRUE) option(BUILD_FT "Build functional tests" TRUE) +option(BUILD_REG_TESTS "Build regression tests" TRUE) option(BUILD_CONFIG "Build cmake configs" TRUE) option(ENABLE_MPI "Enable MPI support" TRUE) option(ENABLE_MPI_TESTS "Enable MPI tests support" TRUE) option(ENABLE_SYCL_INTEROP_EVENT "Enable SYCL interop event support" TRUE) -option(ENABLE_OFI_HMEM "Enable support OFI HMEM support" FALSE) +option(ENABLE_OFI_HMEM "Enable OFI HMEM support" TRUE) option(ENABLE_OFI_OOT_PROV "Enable OFI out-of-tree providers support" FALSE) option(ENABLE_ITT "Enable ITT profiling support" TRUE) +option(ENABLE_PMIX "Enable PMIX support" TRUE) option(ENABLE_STUB_BACKEND "Enable stub backend" TRUE) +option(ENABLE_LINKER_RUNPATH "Enable linker runpath flags" FALSE) option(USE_CODECOV_FLAGS "Calculate code coverage" FALSE) option(WITH_ASAN "Use address sanitizer, can only be used in Debug build" FALSE) @@ -79,7 +98,10 @@ message(STATUS "Enable SYCL interop event support: ${ENABLE_SYCL_INTEROP_EVENT}" message(STATUS "Enable OFI HMEM support: ${ENABLE_OFI_HMEM}") message(STATUS "Enable OFI out-of-tree providers support: ${ENABLE_OFI_OOT_PROV}") message(STATUS "Enable ITT profiling support: ${ENABLE_ITT}") -message(STATUS "Enable stub backend" ${ENABLE_STUB_BACKEND}) +message(STATUS "Enable PMIX support: ${ENABLE_PMIX}") +message(STATUS "Enable DRM 
support: ${ENABLE_DRM}") +message(STATUS "Enable stub backend: ${ENABLE_STUB_BACKEND}") +message(STATUS "Enable linker rpath flags: ${ENABLE_LINKER_RUNPATH}") add_definitions(-DCCL_C_COMPILER="${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") add_definitions(-DCCL_CXX_COMPILER="${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") @@ -131,6 +153,11 @@ message(STATUS "ITT_LIB_DIR: ${ITT_LIB_DIR}") set(LEVEL_ZERO_INCLUDE_DIR "${DEPS_DIR}/level_zero/include/") message(STATUS "LEVEL_ZERO_INCLUDE_DIR: ${LEVEL_ZERO_INCLUDE_DIR}") +message(STATUS "DRM_INCLUDE_DIR: ${DRM_INCLUDE_DIR}") + +set(PMIX_INCLUDE_DIR "${DEPS_DIR}/pmix/include/") +message(STATUS "PMIX_INCLUDE_DIR: ${PMIX_INCLUDE_DIR}") + set(CMAKE_SKIP_INSTALL_RPATH TRUE) set(CMAKE_SKIP_RPATH TRUE) @@ -141,15 +168,21 @@ if (${CMAKE_VERSION} VERSION_LESS 3.1) set(C_COMPILER_FLAGS "-std=gnu99") endif() +if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + set(EXTRA_WARN_FLAGS "-Wshadow") +endif() + +set(COMPILER_WARN_FLAGS "-Wall -Wextra -Wno-unused-parameter -Werror ${EXTRA_WARN_FLAGS}") + # common release/debug compilation settings -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_COMPILER_FLAGS} -Wall -Wextra -Wno-unused-parameter -Werror -D_GNU_SOURCE -fvisibility=internal") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_COMPILER_FLAGS} ${COMPILER_WARN_FLAGS} -D_GNU_SOURCE -fvisibility=internal") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${C_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${C_COMPILER_FLAGS} -O3") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} ${C_COMPILER_FLAGS} -O2 -g") set(CMAKE_C_STANDARD 99) set(CMAKE_C_STANDARD_REQUIRED ON) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMPILER_FLAGS} -Wall -Wextra -Wno-unused-parameter -Werror -D_GNU_SOURCE -fvisibility=internal") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMPILER_FLAGS} ${COMPILER_WARN_FLAGS} -D_GNU_SOURCE -fvisibility=internal") 
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${CXX_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CXX_COMPILER_FLAGS} -O3") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${CXX_COMPILER_FLAGS} -O2 -g") @@ -162,6 +195,9 @@ if (NOT ${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel") endif() set(COMMON_CMAKE_DIR ${PROJECT_SOURCE_DIR}/cmake) + +define_compute_backend() + if (COMPUTE_BACKEND) precheck_compute_backend() message(STATUS "COMPUTE_BACKEND: ${COMPUTE_BACKEND}") @@ -170,6 +206,9 @@ if (COMPUTE_BACKEND) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCCL_ENABLE_OFI_HMEM=1") message(STATUS "Enable OFI HMEM support for compute backend ${COMPUTE_BACKEND}") endif() + if (${COMPUTE_BACKEND} STREQUAL "dpcpp" AND ${CMAKE_CXX_COMPILER} MATCHES ".*icpx") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl") + endif() endif() if (ENABLE_OFI_OOT_PROV) @@ -182,6 +221,16 @@ if (ENABLE_ITT) message(STATUS "Enable ITT profiling support") endif() +if (ENABLE_PMIX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCCL_ENABLE_PMIX=1") + message(STATUS "Enable PMIX support") +endif() + +if (ENABLE_DRM) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCCL_ENABLE_DRM=1") + message(STATUS "Enable DRM support") +endif() + if (ENABLE_STUB_BACKEND) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCCL_ENABLE_STUB_BACKEND=1") message(STATUS "Enable stub backend") @@ -225,8 +274,8 @@ enable_testing() set(EXTERNAL_LIBS "") -set(EXAMPLES_INC_DIRS ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/examples/include ${MPI_INCLUDE_DIR} ${LEVEL_ZERO_INCLUDE_DIR}) -set(EXAMPLES_LIB_DIRS ${MPI_LIB_DIR} ${LIBFABRIC_LIB_DIR}) +set(EXAMPLES_INC_DIRS ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/examples/include ${MPI_INCLUDE_DIR}) +set(EXAMPLES_LIB_DIRS ${MPI_LIB_DIR}) # allow `deprecated` set(CMAKE_CLANG_FLAGS "${CMAKE_CLANG_FLAGS}") @@ -253,8 +302,8 @@ file(GLOB spv_kernels "${PROJECT_SOURCE_DIR}/src/kernels/kernels.spv") endif() 
set(CCL_MAJOR_VERSION "2021") -set(CCL_MINOR_VERSION "7") -set(CCL_UPDATE_VERSION "1") +set(CCL_MINOR_VERSION "8") +set(CCL_UPDATE_VERSION "0") set(CCL_PRODUCT_STATUS "Gold") string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ") get_vcs_properties("git") diff --git a/INSTALL.md b/INSTALL.md index 0dcf910c9..67b24005b 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -55,7 +55,7 @@ If your CXX compiler requires SYCL, it is possible to specify it (DPC++ is suppo Modify `cmake` command as follows: ``` -cmake .. -DCMAKE_C_COMPILER=your_c_compiler -DCMAKE_CXX_COMPILER=dpcpp -DCOMPUTE_BACKEND=dpcpp +cmake .. -DCMAKE_C_COMPILER=your_c_compiler -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp ``` ## Specify the build type diff --git a/cmake/FindIntelSYCL_level_zero.cmake b/cmake/FindIntelSYCL_level_zero.cmake index a67215fe7..08784bd89 100644 --- a/cmake/FindIntelSYCL_level_zero.cmake +++ b/cmake/FindIntelSYCL_level_zero.cmake @@ -40,7 +40,7 @@ get_filename_component(INTEL_SYCL_BINARY_DIR ${CMAKE_CXX_COMPILER} PATH) # Try to find Intel SYCL version.hpp header find_path(INTEL_SYCL_INCLUDE_DIRS - NAMES CL/sycl/version.hpp + NAMES CL/sycl/version.hpp sycl/version.hpp PATHS ${sycl_root_hints} "${INTEL_SYCL_BINARY_DIR}/.." 
diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index 7a2fa6b15..ca35065f2 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -22,7 +22,7 @@ function(set_lp_env) else() set(CCL_BF16_COMPILER OFF) endif() - message(STATUS "BF16 compiler: ${CCL_BF16_COMPILER}") + message(STATUS "BF16 AVX512F compiler: ${CCL_BF16_COMPILER}") execute_process(COMMAND ld -v OUTPUT_VARIABLE BINUTILS_VERSION_RAW @@ -55,7 +55,7 @@ function(set_lp_env) message(STATUS "BF16 target attributes: ${CCL_BF16_TARGET_ATTRIBUTES}") endif() - option(CCL_BF16_GPU_TRUNCATE "Truncate BF16 in GPU operations" ON) + option(CCL_BF16_GPU_TRUNCATE "Truncate BF16 in GPU operations" OFF) if (CCL_BF16_GPU_TRUNCATE) add_definitions(-DCCL_BF16_GPU_TRUNCATE) endif() @@ -204,7 +204,7 @@ function(activate_compute_backend MODULES_PATH COMPUTE_BACKEND) # remember current target for `target_link_libraries` in ccl set (COMPUTE_BACKEND_TARGET_NAME Intel::SYCL_level_zero) set (COMPUTE_BACKEND_TARGET_NAME Intel::SYCL_level_zero PARENT_SCOPE) - message ("COMPUTE_BACKEND_TARGET_NAME=${COMPUTE_BACKEND_TARGET_NAME} requested. Using DPC++ provider") + message (STATUS "COMPUTE_BACKEND_TARGET_NAME: ${COMPUTE_BACKEND_TARGET_NAME} requested. 
Using DPC++ provider") endif() # extract target properties @@ -231,6 +231,18 @@ function(activate_compute_backend MODULES_PATH COMPUTE_BACKEND) endfunction(activate_compute_backend) +function(define_compute_backend) + if (NOT DEFINED COMPUTE_BACKEND) + message(STATUS "COMPUTE_BACKEND is not defined") + if (${CMAKE_CXX_COMPILER} MATCHES ".*dpcpp") + set(COMPUTE_BACKEND "dpcpp" CACHE STRING "compute backend value") + message(STATUS "COMPUTE_BACKEND: ${COMPUTE_BACKEND} (set by default)") + endif() + else() + message(STATUS "COMPUTE_BACKEND: ${COMPUTE_BACKEND} (set by user)") + endif() +endfunction(define_compute_backend) + function(set_compute_backend COMMON_CMAKE_DIR) activate_compute_backend("${COMMON_CMAKE_DIR}" ${COMPUTE_BACKEND}) @@ -257,13 +269,13 @@ function(set_compute_backend COMMON_CMAKE_DIR) set(CCL_ENABLE_ZE ON PARENT_SCOPE) message(STATUS "Enable CCL Level Zero support") - execute_process(COMMAND dpcpp -v - OUTPUT_VARIABLE DPCPP_VERSION - ERROR_VARIABLE DPCPP_VERSION + execute_process(COMMAND icpx -v + OUTPUT_VARIABLE ICPX_VERSION + ERROR_VARIABLE ICPX_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE ) - message(STATUS "DPC++ compiler version:\n" "${DPCPP_VERSION}") + message(STATUS "DPC++ compiler version:\n" "${ICPX_VERSION}") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPUTE_BACKEND_FLAGS}") diff --git a/deps/mpi/bin/hydra_bstrap_proxy b/deps/mpi/bin/hydra_bstrap_proxy index 29e02c8fb..e039276e0 100755 Binary files a/deps/mpi/bin/hydra_bstrap_proxy and b/deps/mpi/bin/hydra_bstrap_proxy differ diff --git a/deps/mpi/bin/hydra_nameserver b/deps/mpi/bin/hydra_nameserver index 4d1102bc2..6d5ca620a 100755 Binary files a/deps/mpi/bin/hydra_nameserver and b/deps/mpi/bin/hydra_nameserver differ diff --git a/deps/mpi/bin/hydra_pmi_proxy b/deps/mpi/bin/hydra_pmi_proxy index c53a46fce..195ee5543 100755 Binary files a/deps/mpi/bin/hydra_pmi_proxy and b/deps/mpi/bin/hydra_pmi_proxy differ diff --git a/deps/mpi/bin/mpiexec 
b/deps/mpi/bin/mpiexec index 1ecd75f5b..cc39a485a 100755 Binary files a/deps/mpi/bin/mpiexec and b/deps/mpi/bin/mpiexec differ diff --git a/deps/mpi/bin/mpiexec.hydra b/deps/mpi/bin/mpiexec.hydra index 1ecd75f5b..cc39a485a 100755 Binary files a/deps/mpi/bin/mpiexec.hydra and b/deps/mpi/bin/mpiexec.hydra differ diff --git a/deps/mpi/bin/mpigcc b/deps/mpi/bin/mpigcc index 51068db73..7b335b350 100755 --- a/deps/mpi/bin/mpigcc +++ b/deps/mpi/bin/mpigcc @@ -115,7 +115,9 @@ fi # configure (e.g., determining whehter -lsocket is needee) CC="gcc" MPICH_VERSION="3.4a2" -MPIVERSION="2021.7" +CFLAGS="" +CPPFLAGS="" +MPIVERSION="2021.8" MPILIBNAME="mpi" diff --git a/deps/mpi/bin/mpigxx b/deps/mpi/bin/mpigxx index 596c43ade..979b61cbe 100755 --- a/deps/mpi/bin/mpigxx +++ b/deps/mpi/bin/mpigxx @@ -113,7 +113,8 @@ fi # Default settings for compiler, flags, and libraries CXX="g++" MPICH_VERSION="3.4a2" -MPIVERSION="2021.7" +CXXFLAGS="" +MPIVERSION="2021.8" MPILIBNAME="mpi" MPICXXLIBNAME="mpicxx" diff --git a/deps/mpi/bin/mpiicc b/deps/mpi/bin/mpiicc index c793354bf..ac8bd05a7 100755 --- a/deps/mpi/bin/mpiicc +++ b/deps/mpi/bin/mpiicc @@ -121,7 +121,7 @@ LDFLAGS="-ldl" MPILIBNAME="mpi" # MPIVERSION is the version of the MPICH2 library that mpicc is intended for -MPIVERSION="2021.7" +MPIVERSION="2021.8" # # Internal variables # Show is set to echo to cause the compilation command to be echoed instead diff --git a/deps/mpi/bin/mpiicpc b/deps/mpi/bin/mpiicpc index 9c129970e..38d380c7b 100755 --- a/deps/mpi/bin/mpiicpc +++ b/deps/mpi/bin/mpiicpc @@ -122,7 +122,7 @@ MPILIBNAME="mpi" MPICXXLIBNAME="mpicxx" # MPIVERSION is the version of the Intel(R) MPI Library that mpiicpc is intended for -MPIVERSION="2021.7" +MPIVERSION="2021.8" # Internal variables # Show is set to echo to cause the compilation command to be echoed instead diff --git a/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat b/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat index a6988c57a..191e12c0c 100644 Binary files 
a/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat and b/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat differ diff --git a/deps/mpi/etc/tuning_clx-ap_shm.dat b/deps/mpi/etc/tuning_clx-ap_shm.dat index 95cac35d0..41e71534a 100644 Binary files a/deps/mpi/etc/tuning_clx-ap_shm.dat and b/deps/mpi/etc/tuning_clx-ap_shm.dat differ diff --git a/deps/mpi/etc/tuning_skx_shm-ofi.dat b/deps/mpi/etc/tuning_skx_shm-ofi.dat index f9e770897..6a06435ff 100644 Binary files a/deps/mpi/etc/tuning_skx_shm-ofi.dat and b/deps/mpi/etc/tuning_skx_shm-ofi.dat differ diff --git a/deps/mpi/etc/tuning_skx_shm.dat b/deps/mpi/etc/tuning_skx_shm.dat index f9e770897..6a06435ff 100644 Binary files a/deps/mpi/etc/tuning_skx_shm.dat and b/deps/mpi/etc/tuning_skx_shm.dat differ diff --git a/deps/mpi/include/mpi.h b/deps/mpi/include/mpi.h index 1ea01fa26..0a06d83cd 100644 --- a/deps/mpi/include/mpi.h +++ b/deps/mpi/include/mpi.h @@ -584,8 +584,8 @@ typedef int (MPI_Delete_function) ( MPI_Comm, int, void *, void * ); * digits for REV, 1 digit for EXT and 2 digits for EXT_NUMBER. So, * 2019.0.0b0 will have the numeric version 20190000100. 
*/ -#define I_MPI_VERSION "2021.7.1" -#define I_MPI_NUMVERSION 20210701300 +#define I_MPI_VERSION "2021.8.0" +#define I_MPI_NUMVERSION 20210800300 /* for the datatype decoders */ enum MPIR_Combiner_enum { diff --git a/deps/mpi/lib/libmpi.so b/deps/mpi/lib/libmpi.so index 67d4422e1..f48e6cd80 100755 Binary files a/deps/mpi/lib/libmpi.so and b/deps/mpi/lib/libmpi.so differ diff --git a/deps/mpi/lib/libmpi.so.12 b/deps/mpi/lib/libmpi.so.12 index 67d4422e1..f48e6cd80 100755 Binary files a/deps/mpi/lib/libmpi.so.12 and b/deps/mpi/lib/libmpi.so.12 differ diff --git a/deps/mpi/lib/libmpi.so.12.0 b/deps/mpi/lib/libmpi.so.12.0 index 67d4422e1..f48e6cd80 100755 Binary files a/deps/mpi/lib/libmpi.so.12.0 and b/deps/mpi/lib/libmpi.so.12.0 differ diff --git a/deps/mpi/lib/libmpi.so.12.0.0 b/deps/mpi/lib/libmpi.so.12.0.0 index 67d4422e1..f48e6cd80 100755 Binary files a/deps/mpi/lib/libmpi.so.12.0.0 and b/deps/mpi/lib/libmpi.so.12.0.0 differ diff --git a/deps/mpi/lib/libmpifort.so b/deps/mpi/lib/libmpifort.so index 0232c7dc4..9f7b2ca70 100755 Binary files a/deps/mpi/lib/libmpifort.so and b/deps/mpi/lib/libmpifort.so differ diff --git a/deps/mpi/lib/libmpifort.so.12 b/deps/mpi/lib/libmpifort.so.12 index 0232c7dc4..9f7b2ca70 100755 Binary files a/deps/mpi/lib/libmpifort.so.12 and b/deps/mpi/lib/libmpifort.so.12 differ diff --git a/deps/mpi/lib/libmpifort.so.12.0 b/deps/mpi/lib/libmpifort.so.12.0 index 0232c7dc4..9f7b2ca70 100755 Binary files a/deps/mpi/lib/libmpifort.so.12.0 and b/deps/mpi/lib/libmpifort.so.12.0 differ diff --git a/deps/mpi/lib/libmpifort.so.12.0.0 b/deps/mpi/lib/libmpifort.so.12.0.0 index 0232c7dc4..9f7b2ca70 100755 Binary files a/deps/mpi/lib/libmpifort.so.12.0.0 and b/deps/mpi/lib/libmpifort.so.12.0.0 differ diff --git a/deps/mpi/licensing/license.txt b/deps/mpi/licensing/license.txt index f987e502b..71bc0ebe1 100644 --- a/deps/mpi/licensing/license.txt +++ b/deps/mpi/licensing/license.txt @@ -1,33 +1,33 @@ -Intel Simplified Software License (Version August 
2021) +Intel Simplified Software License (Version October 2022) -Use and Redistribution. You may use and redistribute the software (the -"Software"), without modification, provided the following conditions are met: +Intel(R) MPI Library: Copyright (C) 2009 Intel Corporation -* Redistributions must reproduce the above copyright notice and the following - terms of use in the Software and in the documentation and/or other materials - provided with the distribution. -* Neither the name of Intel nor the names of its suppliers may be used to - endorse or promote products derived from this Software without specific - prior written permission. -* No reverse engineering, decompilation, or disassembly of this Software is - permitted. +Use and Redistribution. You may use and redistribute the software, which is +provided in binary form only, (the "Software"), without modification, provided the +following conditions are met: + +* Redistributions must reproduce the above copyright notice and these terms of use +in the Software and in the documentation and/or other materials provided with +the distribution. +* Neither the name of Intel nor the names of its suppliers may be used to endorse +or promote products derived from this Software without specific prior written +permission. +* No reverse engineering, decompilation, or disassembly of the Software is +permitted, nor any modification or alteration of the Software or its operation +at any time, including during execution. No other licenses. Except as provided in the preceding section, Intel grants no licenses or other rights by implication, estoppel or otherwise to, patent, copyright, trademark, trade name, service mark or other intellectual property licenses or rights of Intel. -Third party software. The Software may contain Third Party Software. 
"Third -Party Software" is open source software, third party software, or other Intel -software that may be identified in the Software itself or in the files (if any) -listed in the "third-party-software.txt" or similarly named text file included -with the Software. Third Party Software, even if included with the distribution -of the Software, may be governed by separate license terms, including without -limitation, open source software license terms, third party software license -terms, and other Intel software license terms. Those separate license terms -solely govern your use of the Third Party Software, and nothing in this license -limits any rights under, or grants rights that supersede, the terms of the -applicable license terms. +Third party software. "Third Party Software" means the files (if any) listed in +the "third-party-software.txt" or other similarly-named text file that may be +included with the Software. Third Party Software, even if included with the +distribution of the Software, may be governed by separate license terms, including +without limitation, third party license terms, open source software notices and +terms, and/or other Intel software license terms. These separate license terms +solely govern Your use of the Third Party Software. DISCLAIMER. THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF @@ -63,11 +63,10 @@ Compliance with laws. You agree to comply with all relevant laws and regulations governing your use, transfer, import or export (or prohibition thereof) of the Software. -Governing law. 
All disputes will be governed by the laws of the United States of -America and the State of Delaware without reference to conflict of law -principles and subject to the exclusive jurisdiction of the state or federal -courts sitting in the State of Delaware, and each party agrees that it submits -to the personal jurisdiction and venue of those courts and waives any -objections. The United Nations Convention on Contracts for the International -Sale of Goods (1980) is specifically excluded and will not apply to the -Software. +Governing law. All disputes will be governed by the laws of the United States of +America and the State of Delaware without reference to conflict of law principles +and subject to the exclusive jurisdiction of the state or federal courts sitting +in the State of Delaware, and each party agrees that it submits to the personal +jurisdiction and venue of those courts and waives any objections. THE UNITED +NATIONS CONVENTION ON CONTRACTS FOR THE INTERNATIONAL SALE OF GOODS (1980) IS +SPECIFICALLY EXCLUDED AND WILL NOT APPLY TO THE SOFTWARE. diff --git a/deps/mpi/licensing/third-party-programs.txt b/deps/mpi/licensing/third-party-programs.txt index 403f3829c..a4ae4ae8e 100644 --- a/deps/mpi/licensing/third-party-programs.txt +++ b/deps/mpi/licensing/third-party-programs.txt @@ -1,4 +1,4 @@ -Intel(R) MPI Library 2021.7 Third Party Programs File +Intel(R) MPI Library 2021.8 Third Party Programs File This file is the "third-party-programs.txt" file specified in the associated Intel end user license agreement for the Intel software you are licensing. diff --git a/deps/ofi/bin/fi_info b/deps/ofi/bin/fi_info index 711ae57b3..89077a6fe 100755 Binary files a/deps/ofi/bin/fi_info and b/deps/ofi/bin/fi_info differ diff --git a/deps/ofi/include/rdma/fabric.h b/deps/ofi/include/rdma/fabric.h index 21bffa1d6..f911526f2 100644 --- a/deps/ofi/include/rdma/fabric.h +++ b/deps/ofi/include/rdma/fabric.h @@ -2,6 +2,7 @@ * Copyright (c) 2013-2017 Intel Corporation. 
All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * Copyright (c) 2022 DataDirect Networks, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -44,12 +45,16 @@ #ifdef __GNUC__ #define FI_DEPRECATED_FUNC __attribute__((deprecated)) #define FI_DEPRECATED_FIELD __attribute__((deprecated)) +#define FI_FORMAT_PRINTF(string, first) \ + __attribute__ ((__format__ (__printf__, (string), (first)))) #elif defined(_MSC_VER) #define FI_DEPRECATED_FUNC __declspec(deprecated) #define FI_DEPRECATED_FIELD +#define FI_FORMAT_PRINTF(string, first) #else #define FI_DEPRECATED_FUNC #define FI_DEPRECATED_FIELD +#define FI_FORMAT_PRINTF(string, first) #endif #if defined(__GNUC__) && !defined(__clang__) @@ -79,8 +84,8 @@ extern "C" { #endif #define FI_MAJOR_VERSION 1 -#define FI_MINOR_VERSION 13 -#define FI_REVISION_VERSION 2 +#define FI_MINOR_VERSION 16 +#define FI_REVISION_VERSION 1 enum { FI_PATH_MAX = 256, @@ -91,14 +96,8 @@ enum { #define FI_VERSION(major, minor) (((major) << 16) | (minor)) #define FI_MAJOR(version) (version >> 16) #define FI_MINOR(version) (version & 0xFFFF) -#define FI_VERSION_GE(v1, v2) ((FI_MAJOR(v1) > FI_MAJOR(v2)) || \ - (FI_MAJOR(v1) == FI_MAJOR(v2) && \ - FI_MINOR(v1) == FI_MINOR(v2)) || \ - (FI_MAJOR(v1) == FI_MAJOR(v2) && \ - FI_MINOR(v1) > FI_MINOR(v2))) -#define FI_VERSION_LT(v1, v2) ((FI_MAJOR(v1) < FI_MAJOR(v2)) || \ - (FI_MAJOR(v1) == FI_MAJOR(v2) && \ - FI_MINOR(v1) < FI_MINOR(v2))) +#define FI_VERSION_GE(v1, v2) (v1 >= v2) +#define FI_VERSION_LT(v1, v2) (v1 < v2) uint32_t fi_version(void); @@ -166,6 +165,11 @@ typedef struct fid *fid_t; #define FI_COMMIT_COMPLETE (1ULL << 30) #define FI_MATCH_COMPLETE (1ULL << 31) +#define FI_AV_USER_ID (1ULL << 41) +#define FI_PEER_SRX (1ULL << 42) +#define FI_PEER_CQ (1ULL << 43) +#define FI_XPU_TRIGGER 
(1ULL << 44) +#define FI_HMEM_HOST_ALLOC (1ULL << 45) #define FI_HMEM_DEVICE_ONLY (1ULL << 46) #define FI_HMEM (1ULL << 47) #define FI_VARIABLE_MSG (1ULL << 48) @@ -209,7 +213,9 @@ enum { FI_ADDR_PSMX2, /* uint64_t[2] */ FI_ADDR_IB_UD, /* uint64_t[4] */ FI_ADDR_EFA, - FI_ADDR_PSMX3, /* uint64_t[2] */ + FI_ADDR_PSMX3, /* uint64_t[4] */ + FI_ADDR_OPX, + FI_ADDR_CXI, }; #define FI_ADDR_UNSPEC ((uint64_t) -1) @@ -239,6 +245,7 @@ enum fi_mr_mode { #define FI_MR_RMA_EVENT (1 << 8) #define FI_MR_ENDPOINT (1 << 9) #define FI_MR_HMEM (1 << 10) +#define FI_MR_COLLECTIVE (1 << 11) enum fi_progress { FI_PROGRESS_UNSPEC, @@ -306,7 +313,7 @@ enum { FI_PROTO_UDP, FI_PROTO_SOCK_TCP, /* MXM provider is deprecated. - * We will keep this value in order to save binary compatibility. + * We will keep this value in order to save binary compatibility. */ FI_PROTO_MXM, FI_PROTO_IWARP_RDM, @@ -322,7 +329,11 @@ enum { FI_PROTO_RSTREAM, FI_PROTO_RDMA_CM_IB_XRC, FI_PROTO_EFA, - FI_PROTO_PSMX3 + FI_PROTO_PSMX3, + FI_PROTO_RXM_TCP, + FI_PROTO_OPX, + FI_PROTO_CXI, + FI_PROTO_XNET, }; enum { @@ -522,6 +533,9 @@ enum { FI_CLASS_AV_SET, FI_CLASS_MR_CACHE, FI_CLASS_MEM_MONITOR, + FI_CLASS_PEER_CQ, + FI_CLASS_PEER_SRX, + FI_CLASS_LOG, }; struct fi_eq_attr; @@ -727,6 +741,9 @@ enum fi_type { FI_TYPE_FID, FI_TYPE_COLLECTIVE_OP, FI_TYPE_HMEM_IFACE, + FI_TYPE_CQ_FORMAT, + FI_TYPE_LOG_LEVEL, + FI_TYPE_LOG_SUBSYS, }; char *fi_tostr(const void *data, enum fi_type datatype); diff --git a/deps/ofi/include/rdma/fi_atomic.h b/deps/ofi/include/rdma/fi_atomic.h new file mode 100644 index 000000000..cc8b1e520 --- /dev/null +++ b/deps/ofi/include/rdma/fi_atomic.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2013-2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef FI_ATOMIC_H +#define FI_ATOMIC_H + +#include +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Atomic flags */ +#define FI_FETCH_ATOMIC (1ULL << 58) +#define FI_COMPARE_ATOMIC (1ULL << 59) + +struct fi_atomic_attr { + size_t count; + size_t size; +}; + +struct fi_msg_atomic { + const struct fi_ioc *msg_iov; + void **desc; + size_t iov_count; + fi_addr_t addr; + const struct fi_rma_ioc *rma_iov; + size_t rma_iov_count; + enum fi_datatype datatype; + enum fi_op op; + void *context; + uint64_t data; +}; + +struct fi_msg_fetch { + struct fi_ioc *msg_iov; + void **desc; + size_t iov_count; +}; + +struct fi_msg_compare { + const struct fi_ioc *msg_iov; + void **desc; + size_t iov_count; +}; + +struct fi_ops_atomic { + size_t size; + ssize_t (*write)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context); + ssize_t (*writev)(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context); + ssize_t (*writemsg)(struct fid_ep *ep, + const struct fi_msg_atomic *msg, uint64_t flags); + ssize_t (*inject)(struct fid_ep *ep, const void *buf, size_t count, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op); + + ssize_t (*readwrite)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context); + ssize_t (*readwritev)(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context); + ssize_t (*readwritemsg)(struct fid_ep 
*ep, + const struct fi_msg_atomic *msg, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + uint64_t flags); + + ssize_t (*compwrite)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context); + ssize_t (*compwritev)(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context); + ssize_t (*compwritemsg)(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + uint64_t flags); + + int (*writevalid)(struct fid_ep *ep, + enum fi_datatype datatype, enum fi_op op, size_t *count); + int (*readwritevalid)(struct fid_ep *ep, + enum fi_datatype datatype, enum fi_op op, size_t *count); + int (*compwritevalid)(struct fid_ep *ep, + enum fi_datatype datatype, enum fi_op op, size_t *count); +}; + +#ifdef FABRIC_DIRECT +#include +#endif /* FABRIC_DIRECT */ + +#ifndef FABRIC_DIRECT_ATOMIC + +static inline ssize_t +fi_atomic(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + return ep->atomic->write(ep, buf, count, desc, dest_addr, addr, key, + datatype, op, context); +} + +static inline ssize_t +fi_atomicv(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + return ep->atomic->writev(ep, iov, desc, 
count, dest_addr, addr, key, + datatype, op, context); +} + +static inline ssize_t +fi_atomicmsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, uint64_t flags) +{ + return ep->atomic->writemsg(ep, msg, flags); +} + +static inline ssize_t +fi_inject_atomic(struct fid_ep *ep, const void *buf, size_t count, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op) +{ + return ep->atomic->inject(ep, buf, count, dest_addr, addr, + key, datatype, op); +} + +static inline ssize_t +fi_fetch_atomic(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + return ep->atomic->readwrite(ep, buf, count, desc, result, result_desc, + dest_addr, addr, key, datatype, op, context); +} + +static inline ssize_t +fi_fetch_atomicv(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + return ep->atomic->readwritev(ep, iov, desc, count, + resultv, result_desc, result_count, + dest_addr, addr, key, datatype, op, context); +} + +static inline ssize_t +fi_fetch_atomicmsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + uint64_t flags) +{ + return ep->atomic->readwritemsg(ep, msg, resultv, result_desc, + result_count, flags); +} + +static inline ssize_t +fi_compare_atomic(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + return ep->atomic->compwrite(ep, buf, count, desc, + compare, compare_desc, result, 
result_desc, + dest_addr, addr, key, datatype, op, context); +} + +static inline ssize_t +fi_compare_atomicv(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + return ep->atomic->compwritev(ep, iov, desc, count, + comparev, compare_desc, compare_count, + resultv, result_desc, result_count, + dest_addr, addr, key, datatype, op, context); +} + +static inline ssize_t +fi_compare_atomicmsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + uint64_t flags) +{ + return ep->atomic->compwritemsg(ep, msg, + comparev, compare_desc, compare_count, + resultv, result_desc, result_count, flags); +} + +static inline int +fi_atomicvalid(struct fid_ep *ep, + enum fi_datatype datatype, enum fi_op op, size_t *count) +{ + return ep->atomic->writevalid(ep, datatype, op, count); +} + +static inline int +fi_fetch_atomicvalid(struct fid_ep *ep, + enum fi_datatype datatype, enum fi_op op, size_t *count) +{ + return ep->atomic->readwritevalid(ep, datatype, op, count); +} + +static inline int +fi_compare_atomicvalid(struct fid_ep *ep, + enum fi_datatype datatype, enum fi_op op, size_t *count) +{ + return ep->atomic->compwritevalid(ep, datatype, op, count); +} + +static inline int +fi_query_atomic(struct fid_domain *domain, + enum fi_datatype datatype, enum fi_op op, + struct fi_atomic_attr *attr, uint64_t flags) +{ + return FI_CHECK_OP(domain->ops, struct fi_ops_domain, query_atomic) ? 
+ domain->ops->query_atomic(domain, datatype, op, attr, flags) : + -FI_ENOSYS; +} + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* FI_ATOMIC_H */ diff --git a/deps/ofi/include/rdma/fi_collective.h b/deps/ofi/include/rdma/fi_collective.h new file mode 100644 index 000000000..41528b54f --- /dev/null +++ b/deps/ofi/include/rdma/fi_collective.h @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef FI_COLLECTIVE_H +#define FI_COLLECTIVE_H + +#include +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef FABRIC_DIRECT +#include +#endif /* FABRIC_DIRECT */ + + +struct fi_ops_av_set { + size_t size; + int (*set_union)(struct fid_av_set *dst, + const struct fid_av_set *src); + int (*intersect)(struct fid_av_set *dst, + const struct fid_av_set *src); + int (*diff)(struct fid_av_set *dst, const struct fid_av_set *src); + int (*insert)(struct fid_av_set *set, fi_addr_t addr); + int (*remove)(struct fid_av_set *set, fi_addr_t addr); + int (*addr)(struct fid_av_set *set, fi_addr_t *coll_addr); +}; + +struct fid_av_set { + struct fid fid; + struct fi_ops_av_set *ops; +}; + +struct fi_collective_attr { + enum fi_op op; + enum fi_datatype datatype; + struct fi_atomic_attr datatype_attr; + size_t max_members; + uint64_t mode; +}; + +struct fi_collective_addr { + const struct fid_av_set *set; + fi_addr_t coll_addr; +}; + +struct fi_msg_collective { + const struct fi_ioc *msg_iov; + void **desc; + size_t iov_count; + fi_addr_t coll_addr; + fi_addr_t root_addr; + enum fi_collective_op coll; + enum fi_datatype datatype; + enum fi_op op; + void *context; +}; + +struct fi_ops_collective { + size_t size; + + ssize_t (*barrier)(struct fid_ep *ep, fi_addr_t coll_addr, + void *context); + ssize_t (*broadcast)(struct fid_ep *ep, + void *buf, size_t count, void *desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + ssize_t (*alltoall)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + ssize_t (*allreduce)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context); + ssize_t (*allgather)(struct fid_ep *ep, + const void *buf, 
size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + ssize_t (*reduce_scatter)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context); + ssize_t (*reduce)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context); + ssize_t (*scatter)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + ssize_t (*gather)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + ssize_t (*msg)(struct fid_ep *ep, + const struct fi_msg_collective *msg, + struct fi_ioc *resultv, void **result_desc, + size_t result_count, uint64_t flags); +}; + + +#ifdef FABRIC_DIRECT +#include +#endif /* FABRIC_DIRECT */ + +#ifndef FABRIC_DIRECT_COLLECTIVE + +static inline int +fi_av_set(struct fid_av *av, struct fi_av_set_attr *attr, + struct fid_av_set **set, void * context) +{ + return FI_CHECK_OP(av->ops, struct fi_ops_av, av_set) ? 
+ av->ops->av_set(av, attr, set, context) : -FI_ENOSYS; +} + +static inline int +fi_av_set_union(struct fid_av_set *dst, const struct fid_av_set *src) +{ + return dst->ops->set_union(dst, src); +} + +static inline int +fi_av_set_intersect(struct fid_av_set *dst, const struct fid_av_set *src) +{ + return dst->ops->intersect(dst, src); +} + +static inline int +fi_av_set_diff(struct fid_av_set *dst, const struct fid_av_set *src) +{ + return dst->ops->diff(dst, src); +} + +static inline int +fi_av_set_insert(struct fid_av_set *set, fi_addr_t addr) +{ + return set->ops->insert(set, addr); +} + +static inline int +fi_av_set_remove(struct fid_av_set *set, fi_addr_t addr) +{ + return set->ops->remove(set, addr); +} + +static inline int +fi_av_set_addr(struct fid_av_set *set, fi_addr_t *coll_addr) +{ + return set->ops->addr(set, coll_addr); +} + +static inline int +fi_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, + const struct fid_av_set *set, + uint64_t flags, struct fid_mc **mc, void *context) +{ + struct fi_collective_addr addr; + + addr.set = set; + addr.coll_addr = coll_addr; + return fi_join(ep, &addr, flags | FI_COLLECTIVE, mc, context); +} + +static inline ssize_t +fi_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context) +{ + return ep->collective->barrier(ep, coll_addr, context); +} + +static inline ssize_t +fi_broadcast(struct fid_ep *ep, void *buf, size_t count, void *desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context) +{ + return ep->collective->broadcast(ep, buf, count, desc, + coll_addr, root_addr, datatype, flags, context); +} + +static inline ssize_t +fi_alltoall(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t coll_addr, enum fi_datatype datatype, + uint64_t flags, void *context) +{ + return ep->collective->alltoall(ep, buf, count, desc, + result, result_desc, coll_addr, datatype, flags, context); +} + +static inline 
ssize_t +fi_allreduce(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context) +{ + return ep->collective->allreduce(ep, buf, count, desc, + result, result_desc, coll_addr, datatype, op, flags, context); +} + +static inline ssize_t +fi_allgather(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context) +{ + return ep->collective->allgather(ep, buf, count, desc, + result, result_desc, coll_addr, datatype, flags, context); +} + +static inline ssize_t +fi_reduce_scatter(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context) +{ + return ep->collective->reduce_scatter(ep, buf, count, desc, + result, result_desc, coll_addr, datatype, op, flags, context); +} + +static inline ssize_t +fi_reduce(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context) +{ + return ep->collective->reduce(ep, buf, count, desc, result, result_desc, + coll_addr, root_addr, datatype, op, flags, context); +} + + +static inline ssize_t +fi_scatter(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, + uint64_t flags, void *context) +{ + return ep->collective->scatter(ep, buf, count, desc, result, result_desc, + coll_addr, root_addr, datatype, flags, context); +} + + +static inline ssize_t +fi_gather(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, 
enum fi_datatype datatype, + uint64_t flags, void *context) +{ + return ep->collective->gather(ep, buf, count, desc, result, result_desc, + coll_addr, root_addr, datatype, flags, context); +} + +static inline +int fi_query_collective(struct fid_domain *domain, enum fi_collective_op coll, + struct fi_collective_attr *attr, uint64_t flags) +{ + return FI_CHECK_OP(domain->ops, struct fi_ops_domain, query_collective) ? + domain->ops->query_collective(domain, coll, attr, flags) : + -FI_ENOSYS; +} + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* FI_COLLECTIVE_H */ diff --git a/deps/ofi/include/rdma/fi_domain.h b/deps/ofi/include/rdma/fi_domain.h index b5399ba00..05b30b831 100644 --- a/deps/ofi/include/rdma/fi_domain.h +++ b/deps/ofi/include/rdma/fi_domain.h @@ -52,7 +52,15 @@ extern "C" { #define FI_SYMMETRIC (1ULL << 59) #define FI_SYNC_ERR (1ULL << 58) #define FI_UNIVERSE (1ULL << 57) - +#define FI_BARRIER_SET (1ULL << 40) +#define FI_BROADCAST_SET (1ULL << 41) +#define FI_ALLTOALL_SET (1ULL << 42) +#define FI_ALLREDUCE_SET (1ULL << 43) +#define FI_ALLGATHER_SET (1ULL << 44) +#define FI_REDUCE_SCATTER_SET (1ULL << 45) +#define FI_REDUCE_SET (1ULL << 46) +#define FI_SCATTER_SET (1ULL << 47) +#define FI_GATHER_SET (1ULL << 48) struct fi_av_attr { enum fi_av_type type; @@ -118,6 +126,8 @@ enum fi_hmem_iface { FI_HMEM_CUDA, FI_HMEM_ROCR, FI_HMEM_ZE, + FI_HMEM_NEURON, + FI_HMEM_SYNAPSEAI, }; struct fi_mr_attr { @@ -134,6 +144,8 @@ struct fi_mr_attr { uint64_t reserved; int cuda; int ze; + int neuron; + int synapseai; } device; }; diff --git a/deps/ofi/include/rdma/fi_endpoint.h b/deps/ofi/include/rdma/fi_endpoint.h index 7f7a4c814..56df151c7 100644 --- a/deps/ofi/include/rdma/fi_endpoint.h +++ b/deps/ofi/include/rdma/fi_endpoint.h @@ -66,6 +66,19 @@ enum { FI_OPT_RECV_BUF_SIZE, FI_OPT_TX_SIZE, FI_OPT_RX_SIZE, + FI_OPT_FI_HMEM_P2P, /* int */ + FI_OPT_XPU_TRIGGER, /* struct fi_trigger_xpu */ +}; + +/* + * Parameters for FI_OPT_HMEM_P2P to allow endpoint control over 
peer to peer + * support and FI_HMEM. + */ +enum { + FI_HMEM_P2P_ENABLED, /* Provider decides when to use P2P, default. */ + FI_HMEM_P2P_REQUIRED, /* Must use P2P for all transfers */ + FI_HMEM_P2P_PREFERRED, /* Should use P2P for all transfers if available */ + FI_HMEM_P2P_DISABLED /* Do not use P2P */ }; struct fi_ops_ep { diff --git a/deps/ofi/include/rdma/fi_errno.h b/deps/ofi/include/rdma/fi_errno.h index 63a6acbfd..fee1046e8 100644 --- a/deps/ofi/include/rdma/fi_errno.h +++ b/deps/ofi/include/rdma/fi_errno.h @@ -193,6 +193,7 @@ enum { FI_ENOKEY = 266, /* Required key not available */ FI_ENOAV = 267, /* Missing or unavailable address vector */ FI_EOVERRUN = 268, /* Queue has been overrun */ + FI_ENORX = 269, /* Receiver not ready, no receive buffers available */ FI_ERRNO_MAX }; diff --git a/deps/ofi/include/rdma/fi_ext.h b/deps/ofi/include/rdma/fi_ext.h new file mode 100644 index 000000000..412007c16 --- /dev/null +++ b/deps/ofi/include/rdma/fi_ext.h @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2021 Intel Corporation. All rights reserved. + * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2022 DataDirect Networks, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef FI_EXT_H +#define FI_EXT_H + +#include +#include +#include +#include +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Each provider needs to define an unique 12-bit provider + * specific code to avoid overlapping with other providers, + * then bit left shift the code 16 bits. Note that the + * highest 4 bits are not touched, so they are still left + * to 0. The lowest 16 bits can be used to define provider + * specific values. 
E.g., + * + * define FI_PROV_SPECIFIC_XXX (0xabc << 16) + * + * enum { + * FI_PROV_XXX_FOO = -(FI_PROV_SPECIFIC_XXX), + * FI_PROV_XXX_BAR, + * } + */ + +#define FI_PROV_SPECIFIC_EFA (0xefa << 16) +#define FI_PROV_SPECIFIC_TCP (0x7cb << 16) + + +/* negative options are provider specific */ +enum { + FI_OPT_EFA_RNR_RETRY = -FI_PROV_SPECIFIC_EFA, +}; + +struct fi_fid_export { + struct fid **fid; + uint64_t flags; + void *context; +}; + +static inline int +fi_export_fid(struct fid *fid, uint64_t flags, + struct fid **expfid, void *context) +{ + struct fi_fid_export exp; + + exp.fid = expfid; + exp.flags = flags; + exp.context = context; + return fi_control(fid, FI_EXPORT_FID, &exp); +} + +static inline int +fi_import_fid(struct fid *fid, struct fid *expfid, uint64_t flags) +{ + return fid->ops->bind(fid, expfid, flags); +} + + +/* + * System memory monitor import extension: + * To use, open mr_cache fid and import. + */ + +struct fid_mem_monitor; + +struct fi_ops_mem_monitor { + size_t size; + int (*start)(struct fid_mem_monitor *monitor); + void (*stop)(struct fid_mem_monitor *monitor); + int (*subscribe)(struct fid_mem_monitor *monitor, + const void *addr, size_t len); + void (*unsubscribe)(struct fid_mem_monitor *monitor, + const void *addr, size_t len); + bool (*valid)(struct fid_mem_monitor *monitor, + const void *addr, size_t len); +}; + +struct fi_ops_mem_notify { + size_t size; + void (*notify)(struct fid_mem_monitor *monitor, const void *addr, + size_t len); +}; + +struct fid_mem_monitor { + struct fid fid; + struct fi_ops_mem_monitor *export_ops; + struct fi_ops_mem_notify *import_ops; +}; + + +/* Peer provider CQ support. 
*/ +struct fid_peer_cq; + +struct fi_ops_cq_owner { + size_t size; + ssize_t (*write)(struct fid_peer_cq *cq, void *context, uint64_t flags, + size_t len, void *buf, uint64_t data, uint64_t tag, + fi_addr_t src); + ssize_t (*writeerr)(struct fid_peer_cq *cq, + const struct fi_cq_err_entry *err_entry); +}; + +struct fid_peer_cq { + struct fid fid; + struct fi_ops_cq_owner *owner_ops; +}; + +struct fi_peer_cq_context { + size_t size; + struct fid_peer_cq *cq; +}; + + +/* Peer shared rx context */ +struct fid_peer_srx; + +/* Castable to dlist_entry */ +struct fi_peer_rx_entry { + struct fi_peer_rx_entry *next; + struct fi_peer_rx_entry *prev; + struct fid_peer_srx *srx; + fi_addr_t addr; + size_t size; + uint64_t tag; + uint64_t flags; + void *context; + size_t count; + void **desc; + void *peer_context; + void *owner_context; + struct iovec *iov; +}; + +struct fi_ops_srx_owner { + size_t size; + int (*get_msg)(struct fid_peer_srx *srx, fi_addr_t addr, + size_t size, struct fi_peer_rx_entry **entry); + int (*get_tag)(struct fid_peer_srx *srx, fi_addr_t addr, + uint64_t tag, struct fi_peer_rx_entry **entry); + int (*queue_msg)(struct fi_peer_rx_entry *entry); + int (*queue_tag)(struct fi_peer_rx_entry *entry); + + void (*free_entry)(struct fi_peer_rx_entry *entry); +}; + +struct fi_ops_srx_peer { + size_t size; + int (*start_msg)(struct fi_peer_rx_entry *entry); + int (*start_tag)(struct fi_peer_rx_entry *entry); + int (*discard_msg)(struct fi_peer_rx_entry *entry); + int (*discard_tag)(struct fi_peer_rx_entry *entry); +}; + +struct fid_peer_srx { + struct fid_ep ep_fid; + struct fi_ops_srx_owner *owner_ops; + struct fi_ops_srx_peer *peer_ops; +}; + +struct fi_peer_srx_context { + size_t size; + struct fid_peer_srx *srx; +}; + +/* + * System logging import extension: + * To use, open logging fid and import. 
+ */ + +#define FI_LOG_PROV_FILTERED (1ULL << 0) /* Filter provider */ + +struct fi_ops_log { + size_t size; + int (*enabled)(const struct fi_provider *prov, enum fi_log_level level, + enum fi_log_subsys subsys, uint64_t flags); + int (*ready)(const struct fi_provider *prov, enum fi_log_level level, + enum fi_log_subsys subsys, uint64_t flags, uint64_t *showtime); + void (*log)(const struct fi_provider *prov, enum fi_log_level level, + enum fi_log_subsys subsys, const char *func, int line, + const char *msg); +}; + +struct fid_logging { + struct fid fid; + struct fi_ops_log *ops; +}; + +static inline int fi_import(uint32_t version, const char *name, void *attr, + size_t attr_len, uint64_t flags, struct fid *fid, + void *context) +{ + struct fid *open_fid; + int ret; + + ret = fi_open(version, name, attr, attr_len, flags, &open_fid, context); + if (ret != FI_SUCCESS) + return ret; + + ret = fi_import_fid(open_fid, fid, flags); + fi_close(open_fid); + return ret; +} + +static inline int fi_import_log(uint32_t version, uint64_t flags, + struct fid_logging *log_fid) +{ + log_fid->fid.fclass = FI_CLASS_LOG; + log_fid->ops->size = sizeof(struct fi_ops_log); + + return fi_import(version, "logging", NULL, 0, flags, &log_fid->fid, log_fid); +} + +#ifdef __cplusplus +} +#endif + +#endif /* FI_EXT_H */ diff --git a/deps/ofi/include/rdma/fi_ext_psm2.h b/deps/ofi/include/rdma/fi_ext_psm2.h new file mode 100644 index 000000000..3a48d83e1 --- /dev/null +++ b/deps/ofi/include/rdma/fi_ext_psm2.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef FI_EXT_PSM2_H +#define FI_EXT_PSM2_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* Provider specific name for fi_set_val() / fi_get_val() */ +#define FI_PSM2_DISCONNECT (1U | FI_PROV_SPECIFIC) + +#ifdef __cplusplus +} +#endif + +#endif /* FI_EXT_PSM2_H */ diff --git a/deps/ofi/include/rdma/fi_trigger.h b/deps/ofi/include/rdma/fi_trigger.h new file mode 100644 index 000000000..80f7d648b --- /dev/null +++ b/deps/ofi/include/rdma/fi_trigger.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef FI_TRIGGER_H +#define FI_TRIGGER_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +enum fi_trigger_event { + FI_TRIGGER_THRESHOLD, + FI_TRIGGER_XPU, +}; + +enum fi_op_type { + FI_OP_RECV, + FI_OP_SEND, + FI_OP_TRECV, + FI_OP_TSEND, + FI_OP_READ, + FI_OP_WRITE, + FI_OP_ATOMIC, + FI_OP_FETCH_ATOMIC, + FI_OP_COMPARE_ATOMIC, + FI_OP_CNTR_SET, + FI_OP_CNTR_ADD +}; + +struct fi_trigger_threshold { + struct fid_cntr *cntr; + size_t threshold; +}; + +struct fi_trigger_var { + enum fi_datatype datatype; + int count; + void *addr; + union { + uint8_t val8; + uint16_t val16; + uint32_t val32; + uint64_t val64; + uint8_t *data; + } value; +}; + +struct fi_trigger_xpu { + int count; + enum fi_hmem_iface iface; + union { + uint64_t reserved; + int cuda; + int ze; + } device; + struct fi_trigger_var *var; +}; + +struct fi_op_msg { + struct fid_ep *ep; + struct fi_msg msg; + uint64_t flags; +}; + +struct fi_op_tagged { + struct fid_ep *ep; + struct fi_msg_tagged msg; + uint64_t flags; +}; + +struct fi_op_rma { + struct fid_ep *ep; + struct fi_msg_rma msg; + uint64_t flags; +}; + +struct fi_op_atomic { + struct fid_ep *ep; + struct fi_msg_atomic msg; + uint64_t flags; +}; + +struct fi_op_fetch_atomic { + struct fid_ep *ep; + struct fi_msg_atomic msg; + struct fi_msg_fetch fetch; + uint64_t flags; +}; + +struct fi_op_compare_atomic { + struct fid_ep *ep; + struct fi_msg_atomic msg; + struct fi_msg_fetch fetch; + struct fi_msg_compare compare; + uint64_t flags; +}; + +struct fi_op_cntr { + struct fid_cntr *cntr; + uint64_t value; +}; + +#ifdef FABRIC_DIRECT +#include +#endif + +#ifndef FABRIC_DIRECT_TRIGGER + +/* Size must match struct fi_context */ +struct fi_triggered_context { + enum fi_trigger_event event_type; + union { + struct fi_trigger_threshold threshold; + struct fi_trigger_xpu xpu; + void *internal[3]; + } trigger; +}; + +/* Size must match struct fi_context2 */ +struct fi_triggered_context2 { + enum 
fi_trigger_event event_type; + union { + struct fi_trigger_threshold threshold; + struct fi_trigger_xpu xpu; + void *internal[7]; + } trigger; +}; + +struct fi_deferred_work { + struct fi_context2 context; + + uint64_t threshold; + struct fid_cntr *triggering_cntr; + struct fid_cntr *completion_cntr; + + enum fi_op_type op_type; + + union { + struct fi_op_msg *msg; + struct fi_op_tagged *tagged; + struct fi_op_rma *rma; + struct fi_op_atomic *atomic; + struct fi_op_fetch_atomic *fetch_atomic; + struct fi_op_compare_atomic *compare_atomic; + struct fi_op_cntr *cntr; + } op; +}; + +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* FI_TRIGGER_H */ diff --git a/deps/ofi/include/rdma/providers/fi_log.h b/deps/ofi/include/rdma/providers/fi_log.h new file mode 100644 index 000000000..614551b1d --- /dev/null +++ b/deps/ofi/include/rdma/providers/fi_log.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2015-2016, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015, Intel Corp., Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef FI_LOG_H +#define FI_LOG_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +enum fi_log_subsys { + FI_LOG_CORE, + FI_LOG_FABRIC, + FI_LOG_DOMAIN, + FI_LOG_EP_CTRL, + FI_LOG_EP_DATA, + FI_LOG_AV, + FI_LOG_CQ, + FI_LOG_EQ, + FI_LOG_MR, + FI_LOG_CNTR, + FI_LOG_SUBSYS_MAX +}; + +enum fi_log_level { + FI_LOG_WARN, + FI_LOG_TRACE, + FI_LOG_INFO, + FI_LOG_DEBUG, + FI_LOG_MAX +}; + +int fi_log_enabled(const struct fi_provider *prov, enum fi_log_level level, + enum fi_log_subsys subsys); +int fi_log_ready(const struct fi_provider *prov, enum fi_log_level level, + enum fi_log_subsys subsys, uint64_t *showtime); +void fi_log(const struct fi_provider *prov, enum fi_log_level level, + enum fi_log_subsys subsys, const char *func, int line, + const char *fmt, ...) FI_FORMAT_PRINTF(6, 7); + +#define FI_LOG(prov, level, subsystem, ...) \ + do { \ + if (fi_log_enabled(prov, level, subsystem)) { \ + int saved_errno = errno; \ + fi_log(prov, level, subsystem, \ + __func__, __LINE__, __VA_ARGS__); \ + errno = saved_errno; \ + } \ + } while (0) + +#define FI_LOG_SPARSE(prov, level, subsystem, ...) \ + do { \ + static uint64_t showtime; \ + if (fi_log_ready(prov, level, subsystem, &showtime)) { \ + int saved_errno = errno; \ + fi_log(prov, level, subsystem, \ + __func__, __LINE__, __VA_ARGS__); \ + errno = saved_errno; \ + } \ + } while (0) + +#define FI_WARN(prov, subsystem, ...) 
\ + FI_LOG(prov, FI_LOG_WARN, subsystem, __VA_ARGS__) +#define FI_WARN_SPARSE(prov, subsystem, ...) \ + FI_LOG_SPARSE(prov, FI_LOG_WARN, subsystem, __VA_ARGS__) + +#define FI_TRACE(prov, subsystem, ...) \ + FI_LOG(prov, FI_LOG_TRACE, subsystem, __VA_ARGS__) + +#define FI_INFO(prov, subsystem, ...) \ + FI_LOG(prov, FI_LOG_INFO, subsystem, __VA_ARGS__) + +#if defined(ENABLE_DEBUG) && ENABLE_DEBUG +#define FI_DBG(prov, subsystem, ...) \ + FI_LOG(prov, FI_LOG_DEBUG, subsystem, __VA_ARGS__) +#define FI_DBG_TRACE(prov, subsystem, ...) \ + FI_LOG(prov, FI_LOG_TRACE, subsystem, __VA_ARGS__) +#else +#define FI_DBG(prov_name, subsystem, ...) \ + do {} while (0) +#define FI_DBG_TRACE(prov, subsystem, ...) \ + do {} while (0) +#endif + +#define FI_WARN_ONCE(prov, subsystem, ...) \ + do { \ + static int warned = 0; \ + if (!warned && \ + fi_log_enabled(prov, FI_LOG_WARN, subsystem)) { \ + int saved_errno = errno; \ + fi_log(prov, FI_LOG_WARN, subsystem, \ + __func__, __LINE__, __VA_ARGS__); \ + warned = 1; \ + errno = saved_errno; \ + } \ + } while (0) + +#ifdef __cplusplus +} +#endif + +#endif /* FI_LOG_H */ diff --git a/deps/ofi/include/rdma/providers/fi_prov.h b/deps/ofi/include/rdma/providers/fi_prov.h new file mode 100644 index 000000000..ab8858d8f --- /dev/null +++ b/deps/ofi/include/rdma/providers/fi_prov.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2013-2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef FI_PROV_H +#define FI_PROV_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Extension that dl-loaded providers should add to their .so filename + * (probably via libtool "-release" option). For example a provider + * driver named "foo" should build a plug-in named "libfoo-fi.so", and + * place it in $prefix/$libdir/libfabric/ + */ +#define FI_LIB_EXTENSION "fi" +#define FI_LIB_SUFFIX FI_LIB_EXTENSION ".so" + +/* + * Dynamically loaded providers must export the following entry point. + * This is invoked by the libfabric framework when the provider library + * is loaded. 
+ */ +#define FI_EXT_INI \ + __attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) \ + struct fi_provider* fi_prov_ini(void) + +struct fi_provider { + uint32_t version; + uint32_t fi_version; + struct fi_context context; + const char *name; + int (*getinfo)(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info); + int (*fabric)(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context); + void (*cleanup)(void); +}; + + +/* + * Defines a configuration parameter for use with libfabric. + */ +int fi_param_define(const struct fi_provider *provider, const char *param_name, + enum fi_param_type type, const char *help_string_fmt, ...); + +/* + * Get the value of a configuration variable. + * + * Currently, configuration parameter will only be read from the + * environment. Someday this call could be expanded to also check + * config files. + */ +int fi_param_get(struct fi_provider *provider, const char *param_name, + void *value); + +static inline int +fi_param_get_str(struct fi_provider *provider, const char *param_name, char **value) +{ + return fi_param_get(provider, param_name, value); +} + +static inline int +fi_param_get_int(struct fi_provider *provider, const char *param_name, int *value) +{ + return fi_param_get(provider, param_name, value); +} + +static inline int +fi_param_get_bool(struct fi_provider *provider, const char *param_name, int *value) +{ + return fi_param_get(provider, param_name, value); +} + +static inline int +fi_param_get_size_t(struct fi_provider *provider, const char *param_name, size_t *value) +{ + return fi_param_get(provider, param_name, value); +} + +#ifdef __cplusplus +} +#endif + +#endif /* FI_PROV_H */ diff --git a/deps/ofi/lib/libfabric.so b/deps/ofi/lib/libfabric.so index cf435ab98..06c884e1a 100755 Binary files a/deps/ofi/lib/libfabric.so and b/deps/ofi/lib/libfabric.so differ diff --git a/deps/ofi/lib/libfabric.so.1 
b/deps/ofi/lib/libfabric.so.1 index cf435ab98..06c884e1a 100755 Binary files a/deps/ofi/lib/libfabric.so.1 and b/deps/ofi/lib/libfabric.so.1 differ diff --git a/deps/ofi/lib/prov/libpsm3-fi.so b/deps/ofi/lib/prov/libpsm3-fi.so index 875cedbc8..0a937ac88 100755 Binary files a/deps/ofi/lib/prov/libpsm3-fi.so and b/deps/ofi/lib/prov/libpsm3-fi.so differ diff --git a/deps/ofi/lib/prov/libpsmx2-fi.so b/deps/ofi/lib/prov/libpsmx2-fi.so index edb03e004..dfce6641f 100755 Binary files a/deps/ofi/lib/prov/libpsmx2-fi.so and b/deps/ofi/lib/prov/libpsmx2-fi.so differ diff --git a/deps/ofi/lib/prov/librxm-fi.so b/deps/ofi/lib/prov/librxm-fi.so index 211edb301..a10783523 100755 Binary files a/deps/ofi/lib/prov/librxm-fi.so and b/deps/ofi/lib/prov/librxm-fi.so differ diff --git a/deps/ofi/lib/prov/libshm-fi.so b/deps/ofi/lib/prov/libshm-fi.so index 9394b7be7..b461c3f3e 100755 Binary files a/deps/ofi/lib/prov/libshm-fi.so and b/deps/ofi/lib/prov/libshm-fi.so differ diff --git a/deps/ofi/lib/prov/libsockets-fi.so b/deps/ofi/lib/prov/libsockets-fi.so index 7145739c7..b33844f35 100755 Binary files a/deps/ofi/lib/prov/libsockets-fi.so and b/deps/ofi/lib/prov/libsockets-fi.so differ diff --git a/deps/ofi/lib/prov/libtcp-fi.so b/deps/ofi/lib/prov/libtcp-fi.so index 6861c2533..89be5f0e8 100755 Binary files a/deps/ofi/lib/prov/libtcp-fi.so and b/deps/ofi/lib/prov/libtcp-fi.so differ diff --git a/deps/ofi/lib/prov/libverbs-fi.so b/deps/ofi/lib/prov/libverbs-fi.so new file mode 100755 index 000000000..75003ff0f Binary files /dev/null and b/deps/ofi/lib/prov/libverbs-fi.so differ diff --git a/deps/pmix/include/pmix.h b/deps/pmix/include/pmix.h new file mode 100644 index 000000000..d66067643 --- /dev/null +++ b/deps/pmix/include/pmix.h @@ -0,0 +1,1655 @@ +/* + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer listed + * in this license in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * The copyright holders provide no reassurances that the source code + * provided does not infringe any patent, copyright, or any other + * intellectual property rights of third parties. The copyright holders + * disclaim any liability to any recipient for claims brought against + * recipient by any third party for infringement of that parties + * intellectual property rights. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Copyright (c) 2021-2022 Nanook Consulting All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIx_H +#define PMIx_H + +/* Structure and constant definitions */ +#include + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/**** PMIX API ****/ + +/* Initialize the PMIx client, returning the process identifier assigned + * to this client's application in the provided pmix_proc_t struct. + * Passing a parameter of _NULL_ for this parameter is allowed if the user + * wishes solely to initialize the PMIx system and does not require + * return of the identifier at that time. + * + * When called the PMIx client will check for the required connection + * information of the local PMIx server and will establish the connection. + * If the information is not found, or the server connection fails, then + * an appropriate error constant will be returned. + * + * If successful, the function will return PMIX_SUCCESS and will fill the + * provided structure with the server-assigned namespace and rank of the + * process within the application. + * + * Note that the PMIx client library is referenced counted, and so multiple + * calls to PMIx_Init are allowed. Thus, one way to obtain the namespace and + * rank of the process is to simply call PMIx_Init with a non-NULL parameter. + * + * The info array is used to pass user requests pertaining to the init + * and subsequent operations. Pass a _NULL_ value for the array pointer + * is supported if no directives are desired. + */ +PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, + pmix_info_t info[], size_t ninfo); + +/* Finalize the PMIx client, closing the connection to the local server. + * An error code will be returned if, for some reason, the connection + * cannot be closed. + * + * The info array is used to pass user requests regarding the finalize + * operation. 
This can include: + * + * (a) PMIX_EMBED_BARRIER - By default, PMIx_Finalize does not include an + * internal barrier operation. This attribute directs PMIx_Finalize to + * execute a barrier as part of the finalize operation. + */ +PMIX_EXPORT pmix_status_t PMIx_Finalize(const pmix_info_t info[], size_t ninfo); + + +/* Returns _true_ if the PMIx client has been successfully initialized, + * returns _false_ otherwise. Note that the function only reports the + * internal state of the PMIx client - it does not verify an active + * connection with the server, nor that the server is functional. */ +PMIX_EXPORT int PMIx_Initialized(void); + + +/* Request that the provided array of procs be aborted, returning the + * provided _status_ and printing the provided message. A _NULL_ + * for the proc array indicates that all processes in the caller's + * nspace are to be aborted. + * + * The response to this request is somewhat dependent on the specific resource + * manager and its configuration (e.g., some resource managers will + * not abort the application if the provided _status_ is zero unless + * specifically configured to do so), and thus lies outside the control + * of PMIx itself. However, the client will inform the RM of + * the request that the application be aborted, regardless of the + * value of the provided _status_. + * + * Passing a _NULL_ msg parameter is allowed. Note that race conditions + * caused by multiple processes calling PMIx_Abort are left to the + * server implementation to resolve with regard to which status is + * returned and what messages (if any) are printed. */ +PMIX_EXPORT pmix_status_t PMIx_Abort(int status, const char msg[], + pmix_proc_t procs[], size_t nprocs); + + +/* Push a value into the client's namespace. The client library will cache + * the information locally until _PMIx_Commit_ is called. The provided scope + * value is passed to the local PMIx server, which will distribute the data + * as directed. 
*/ +PMIX_EXPORT pmix_status_t PMIx_Put(pmix_scope_t scope, + const char key[], + pmix_value_t *val); + + +/* Push all previously _PMIx_Put_ values to the local PMIx server. + * This is an asynchronous operation - the library will immediately + * return to the caller while the data is transmitted to the local + * server in the background */ +PMIX_EXPORT pmix_status_t PMIx_Commit(void); + + +/* Execute a blocking barrier across the processes identified in the + * specified array. Passing a _NULL_ pointer as the _procs_ parameter + * indicates that the barrier is to span all processes in the client's + * namespace. Each provided pmix_proc_t struct can pass PMIX_RANK_WILDCARD to + * indicate that all processes in the given namespace are + * participating. + * + * The info array is used to pass user requests regarding the fence + * operation. This can include: + * + * (a) PMIX_COLLECT_DATA - a boolean indicating whether or not the barrier + * operation is to return the _put_ data from all participating processes. + * A value of _false_ indicates that the callback is just used as a release + * and no data is to be returned at that time. A value of _true_ indicates + * that all _put_ data is to be collected by the barrier. Returned data is + * cached at the server to reduce memory footprint, and can be retrieved + * as needed by calls to PMIx_Get(nb). + * + * Note that for scalability reasons, the default behavior for PMIx_Fence + * is to _not_ collect the data. + * + * (b) PMIX_COLLECTIVE_ALGO - a comma-delimited string indicating the algos + * to be used for executing the barrier, in priority order. + * + * (c) PMIX_COLLECTIVE_ALGO_REQD - instructs the host RM that it should return + * an error if none of the specified algos are available. Otherwise, the RM + * is to use one of the algos if possible, but is otherwise free to use any + * of its available methods to execute the operation. 
+ * + * (d) PMIX_TIMEOUT - maximum time for the fence to execute before declaring + * an error. By default, the RM shall terminate the operation and notify participants + * if one or more of the indicated procs fails during the fence. However, + * the timeout parameter can help avoid "hangs" due to programming errors + * that prevent one or more procs from reaching the "fence". + */ +PMIX_EXPORT pmix_status_t PMIx_Fence(const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo); + +/* Non-blocking version of PMIx_Fence. Note that the function will return + * an error if a _NULL_ callback function is given. */ +PMIX_EXPORT pmix_status_t PMIx_Fence_nb(const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); + + +/* Retrieve information for the specified _key_ as published by the process + * identified in the given pmix_proc_t, returning a pointer to the value in the + * given address. + * + * This is a blocking operation - the caller will block until + * the specified data has been _PMIx_Put_ by the specified rank. The caller is + * responsible for freeing all memory associated with the returned value when + * no longer required. + * + * The info array is used to pass user requests regarding the get + * operation. This can include: + * + * (a) PMIX_TIMEOUT - maximum time for the get to execute before declaring + * an error. The timeout parameter can help avoid "hangs" due to programming + * errors that prevent the target proc from ever exposing its data. + */ +PMIX_EXPORT pmix_status_t PMIx_Get(const pmix_proc_t *proc, const char key[], + const pmix_info_t info[], size_t ninfo, + pmix_value_t **val); + +/* A non-blocking operation version of PMIx_Get - the callback function will + * be executed once the specified data has been _PMIx_Put_ + * by the identified process and retrieved by the local server. 
The info + * array is used as described above for the blocking form of this call. */ +PMIX_EXPORT pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const char key[], + const pmix_info_t info[], size_t ninfo, + pmix_value_cbfunc_t cbfunc, void *cbdata); + + +/* Publish the data in the info array for lookup. By default, + * the data will be published into the PMIX_SESSION range and + * with PMIX_PERSIST_APP persistence. Changes to those values, + * and any additional directives, can be included in the pmix_info_t + * array. + * + * Note that the keys must be unique within the specified + * data range or else an error will be returned (first published + * wins). Attempts to access the data by procs outside of + * the provided data range will be rejected. + * + * The persistence parameter instructs the server as to how long + * the data is to be retained. + * + * The blocking form will block until the server confirms that the + * data has been posted and is available. The non-blocking form will + * return immediately, executing the callback when the server confirms + * availability of the data. + */ +PMIX_EXPORT pmix_status_t PMIx_Publish(const pmix_info_t info[], size_t ninfo); +PMIX_EXPORT pmix_status_t PMIx_Publish_nb(const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); + + +/* Lookup information published by this or another process. By default, + * the search will be conducted across the PMIX_SESSION range. Changes + * to the range, and any additional directives, can be provided + * in the pmix_info_t array. Note that the search is also constrained + * to only data published by the current user ID - i.e., the search + * will not return data published by an application being executed + * by another user. There currently is no option to override this + * behavior - such an option may become available later via an + * appropriate pmix_info_t directive. 
+ * + * The "data" parameter consists of an array of pmix_pdata_t struct with the + * keys specifying the requested information. Data will be returned + * for each key in the associated info struct - any key that cannot + * be found will return with a data type of "PMIX_UNDEF". The function + * will return SUCCESS if _any_ values can be found, so the caller + * must check each data element to ensure it was returned. + * + * The proc field in each pmix_pdata_t struct will contain the + * nspace/rank of the process that published the data. + * + * Note: although this is a blocking function, it will _not_ wait + * by default for the requested data to be published. Instead, it + * will block for the time required by the server to lookup its current + * data and return any found items. Thus, the caller is responsible for + * ensuring that data is published prior to executing a lookup, or + * for retrying until the requested data is found + * + * Optionally, the info array can be used to modify this behavior + * by including: + * + * (a) PMIX_WAIT - wait for the requested data to be published. The + * server is to wait until all data has become available. + * + * (b) PMIX_TIMEOUT - max time to wait for data to become available. + * + */ +PMIX_EXPORT pmix_status_t PMIx_Lookup(pmix_pdata_t data[], size_t ndata, + const pmix_info_t info[], size_t ninfo); + +/* Non-blocking form of the _PMIx_Lookup_ function. Data for + * the provided NULL-terminated keys array will be returned + * in the provided callback function. As above, the default + * behavior is to _not_ wait for data to be published. The + * info keys can be used to modify the behavior as previously + * described */ +PMIX_EXPORT pmix_status_t PMIx_Lookup_nb(char **keys, const pmix_info_t info[], size_t ninfo, + pmix_lookup_cbfunc_t cbfunc, void *cbdata); + + +/* Unpublish data posted by this process using the given keys. + * The function will block until the data has been removed by + * the server. 
A value of _NULL_ for the keys parameter instructs + * the server to remove _all_ data published by this process. + * + * By default, the range is assumed to be PMIX_SESSION. Changes + * to the range, and any additional directives, can be provided + * in the pmix_info_t array */ +PMIX_EXPORT pmix_status_t PMIx_Unpublish(char **keys, + const pmix_info_t info[], size_t ninfo); + +/* Non-blocking form of the _PMIx_Unpublish_ function. The + * callback function will be executed once the server confirms + * removal of the specified data. */ +PMIX_EXPORT pmix_status_t PMIx_Unpublish_nb(char **keys, + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); + + +/* Spawn a new job. The assigned namespace of the spawned applications + * is returned in the nspace parameter - a _NULL_ value in that + * location indicates that the caller doesn't wish to have the + * namespace returned. The nspace array must be at least of size + * PMIX_MAX_NSLEN+1. Behavior of individual resource managers + * may differ, but it is expected that failure of any application + * process to start will result in termination/cleanup of _all_ + * processes in the newly spawned job and return of an error + * code to the caller. + * + * By default, the spawned processes will be PMIx "connected" to + * the parent process upon successful launch (see PMIx_Connect + * description for details). Note that this only means that the + * parent process (a) will be given a copy of the new job's + * information so it can query job-level info without + * incurring any communication penalties, and (b) will receive + * notification of errors from process in the child job. + * + * Job-level directives can be specified in the job_info array. 
This + * can include: + * + * (a) PMIX_NON_PMI - processes in the spawned job will + * not be calling PMIx_Init + * + * (b) PMIX_TIMEOUT - declare the spawn as having failed if the launched + * procs do not call PMIx_Init within the specified time + * + * (c) PMIX_NOTIFY_COMPLETION - notify the parent process when the + * child job terminates, either normally or with error + */ +PMIX_EXPORT pmix_status_t PMIx_Spawn(const pmix_info_t job_info[], size_t ninfo, + const pmix_app_t apps[], size_t napps, + pmix_nspace_t nspace); + + +/* Non-blocking form of the _PMIx_Spawn_ function. The callback + * will be executed upon launch of the specified applications, + * or upon failure to launch any of them. */ +PMIX_EXPORT pmix_status_t PMIx_Spawn_nb(const pmix_info_t job_info[], size_t ninfo, + const pmix_app_t apps[], size_t napps, + pmix_spawn_cbfunc_t cbfunc, void *cbdata); + +/* Record the specified processes as "connected". Both blocking and non-blocking + * versions are provided. This means that the resource manager should treat the + * failure of any process in the specified group as a reportable event, and take + * appropriate action. Note that different resource managers may respond to + * failures in different manners. + * + * The callback function is to be called once all participating processes have + * called connect. The server is required to return any job-level info for the + * connecting processes that might not already have - i.e., if the connect + * request involves procs from different nspaces, then each proc shall receive + * the job-level info from those nspaces other than their own. + * + * Note: a process can only engage in _one_ connect operation involving the identical + * set of processes at a time. 
However, a process _can_ be simultaneously engaged + * in multiple connect operations, each involving a different set of processes + * + * As in the case of the fence operation, the info array can be used to pass + * user-level directives regarding the algorithm to be used for the collective + * operation involved in the "connect", timeout constraints, and other options + * available from the host RM */ +PMIX_EXPORT pmix_status_t PMIx_Connect(const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo); + +PMIX_EXPORT pmix_status_t PMIx_Connect_nb(const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); + +/* Disconnect a previously connected set of processes. An error will be returned + * if the specified set of procs was not previously "connected". As above, a process + * may be involved in multiple simultaneous disconnect operations. However, a process + * is not allowed to reconnect to a set of procs that has not fully completed + * disconnect - i.e., you have to fully disconnect before you can reconnect to the + * _same_ group of processes. The info array is used as above. */ +PMIX_EXPORT pmix_status_t PMIx_Disconnect(const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo); + +PMIX_EXPORT pmix_status_t PMIx_Disconnect_nb(const pmix_proc_t ranges[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); + +/* Given a node name, return an array of processes within the specified nspace + * on that node. If the nspace is NULL, then all processes on the node will + * be returned. If the specified node does not currently host any processes, + * then the returned array will be NULL, and nprocs=0. The caller is responsible + * for releasing the array when done with it - the PMIX_PROC_FREE macro is + * provided for this purpose. 
+ */ +PMIX_EXPORT pmix_status_t PMIx_Resolve_peers(const char *nodename, + const pmix_nspace_t nspace, + pmix_proc_t **procs, size_t *nprocs); + + +/* Given an nspace, return the list of nodes hosting processes within + * that nspace. The returned string will contain a comma-delimited list + * of nodenames. The caller is responsible for releasing the string + * when done with it */ +PMIX_EXPORT pmix_status_t PMIx_Resolve_nodes(const pmix_nspace_t nspace, char **nodelist); + +/* Query information about the system in general - can include + * a list of active nspaces, network topology, etc. Also can be + * used to query node-specific info such as the list of peers + * executing on a given node. We assume that the host RM will + * exercise appropriate access control on the information. + * + * The following return status codes are provided in the callback: + * + * PMIX_SUCCESS - all data has been returned + * PMIX_ERR_NOT_FOUND - none of the requested data was available + * PMIX_ERR_PARTIAL_SUCCESS - some of the data has been returned + * PMIX_ERR_NOT_SUPPORTED - the host RM does not support this function + */ +PMIX_EXPORT pmix_status_t PMIx_Query_info(pmix_query_t queries[], size_t nqueries, + pmix_info_t **results, size_t *nresults); + +PMIX_EXPORT pmix_status_t PMIx_Query_info_nb(pmix_query_t queries[], size_t nqueries, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* Log data to a central data service/store, subject to the + * services offered by the host resource manager. The data to + * be logged is provided in the data array. The (optional) directives + * can be used to request specific storage options and direct + * the choice of storage option. + * + * The callback function will be executed when the log operation + * has been completed. 
The data array must be maintained until + * the callback is provided + */ +PMIX_EXPORT pmix_status_t PMIx_Log(const pmix_info_t data[], size_t ndata, + const pmix_info_t directives[], size_t ndirs); + +PMIX_EXPORT pmix_status_t PMIx_Log_nb(const pmix_info_t data[], size_t ndata, + const pmix_info_t directives[], size_t ndirs, + pmix_op_cbfunc_t cbfunc, void *cbdata); + +/* Request an allocation operation from the host resource manager. + * Several broad categories are envisioned, including the ability to: + * + * - request allocation of additional resources, including memory, + * bandwidth, and compute. This should be accomplished in a + * non-blocking manner so that the application can continue to + * progress while waiting for resources to become available. Note + * that the new allocation will be disjoint from (i.e., not + * affiliated with) the allocation of the requestor - thus the + * termination of one allocation will not impact the other. + * + * - extend the reservation on currently allocated resources, subject + * to scheduling availability and priorities. This includes extending + * the time limit on current resources, and/or requesting additional + * resources be allocated to the requesting job. Any additional + * allocated resources will be considered as part of the current + * allocation, and thus will be released at the same time. + * + * - release currently allocated resources that are no longer required. + * This is intended to support partial release of resources since all + * resources are normally released upon termination of the job. The + * identified use-cases include resource variations across discrete steps + * of a workflow, as well as applications that spawn sub-jobs and/or + * dynamically grow/shrink over time + * + * - "lend" resources back to the scheduler with an expectation of getting + * them back at some later time in the job. 
This can be a proactive + * operation (e.g., to save on computing costs when resources are + * temporarily not required), or in response to scheduler requests in + * lieue of preemption. A corresponding ability to "reacquire" resources + * previously released is included. + */ +PMIX_EXPORT pmix_status_t PMIx_Allocation_request(pmix_alloc_directive_t directive, + pmix_info_t *info, size_t ninfo, + pmix_info_t **results, size_t *nresults); + +PMIX_EXPORT pmix_status_t PMIx_Allocation_request_nb(pmix_alloc_directive_t directive, + pmix_info_t *info, size_t ninfo, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* Request a job control action. The targets array identifies the + * processes to which the requested job control action is to be applied. + * A NULL value can be used to indicate all processes in the caller's + * nspace. The use of PMIX_RANK_WILDARD can also be used to indicate + * that all processes in the given nspace are to be included. + * + * The directives are provided as pmix_info_t structs in the directives + * array. The callback function provides a status to indicate whether or + * not the request was granted, and to provide some information as to + * the reason for any denial in the pmix_info_cbfunc_t array of pmix_info_t + * structures. If non-NULL, then the specified release_fn must be called + * when the callback function completes - this will be used to release + * any provided pmix_info_t array. + */ +PMIX_EXPORT pmix_status_t PMIx_Job_control(const pmix_proc_t targets[], size_t ntargets, + const pmix_info_t directives[], size_t ndirs, + pmix_info_t **results, size_t *nresults); + +PMIX_EXPORT pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* Request that something be monitored - e.g., that the server monitor + * this process for periodic heartbeats as an indication that the process + * has not become "wedged". 
When a monitor detects the specified alarm + * condition, it will generate an event notification using the provided + * error code and passing along any available relevant information. It is + * up to the caller to register a corresponding event handler. + * + * Params: + * + * monitor: attribute indicating the type of monitor being requested - e.g., + * PMIX_MONITOR_FILE to indicate that the requestor is asking that + * a file be monitored. + * + * error: the status code to be used when generating an event notification + * alerting that the monitor has been triggered. The range of the + * notification defaults to PMIX_RANGE_NAMESPACE - this can be + * changed by providing a PMIX_RANGE directive + * + * directives: characterize the monitoring request (e.g., monitor file size) + * and frequency of checking to be done + * + * cbfunc: provides a status to indicate whether or not the request was granted, + * and to provide some information as to the reason for any denial in + * the pmix_info_cbfunc_t array of pmix_info_t structures. + * + * Note: a process can send a heartbeat to the server using the PMIx_Heartbeat + * macro provided below*/ +PMIX_EXPORT pmix_status_t PMIx_Process_monitor(const pmix_info_t *monitor, pmix_status_t error, + const pmix_info_t directives[], size_t ndirs, + pmix_info_t **results, size_t *nresults); + +PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* define a special macro to simplify sending of a heartbeat */ +#define PMIx_Heartbeat() \ + do { \ + pmix_info_t _in; \ + PMIX_INFO_CONSTRUCT(&_in); \ + PMIX_INFO_LOAD(&_in, PMIX_SEND_HEARTBEAT, NULL, PMIX_POINTER); \ + PMIx_Process_monitor_nb(&_in, PMIX_SUCCESS, NULL, 0, NULL, NULL); \ + PMIX_INFO_DESTRUCT(&_in); \ + } while(0) + +/* Request a credential from the PMIx server/SMS. 
+ * Input values include: + * + * info - an array of pmix_info_t structures containing any directives the + * caller may wish to pass. Typical usage might include: + * PMIX_TIMEOUT - how long to wait (in seconds) for a credential + * before timing out and returning an error + * PMIX_CRED_TYPE - a prioritized, comma-delimited list of desired + * credential types for use in environments where + * multiple authentication mechanisms may be + * available + * + * ninfo - number of elements in the info array + * + * cbfunc - the pmix_credential_cbfunc_t function to be called upon completion + * of the request + * + * cbdata - pointer to an object to be returned when cbfunc is called + * + * Returned values: + * PMIX_SUCCESS - indicates that the request has been successfully communicated to + * the local PMIx server. The response will be coming in the provided + * callback function. + * + * Any other value indicates an appropriate error condition. The callback function + * will _not_ be called in such cases. + */ +PMIX_EXPORT pmix_status_t PMIx_Get_credential(const pmix_info_t info[], size_t ninfo, + pmix_byte_object_t *credential); + +PMIX_EXPORT pmix_status_t PMIx_Get_credential_nb(const pmix_info_t info[], size_t ninfo, + pmix_credential_cbfunc_t cbfunc, void *cbdata); + +/* Request validation of a credential by the PMIx server/SMS + * Input values include: + * + * cred - pointer to a pmix_byte_object_t containing the credential + * + * info - an array of pmix_info_t structures containing any directives the + * caller may wish to pass. 
Typical usage might include: + * PMIX_TIMEOUT - how long to wait (in seconds) for validation + * before timing out and returning an error + * PMIX_USERID - the expected effective userid of the credential + * to be validated + * PMIX_GROUPID - the expected effective group id of the credential + * to be validated + * + * ninfo - number of elements in the info array + * + * cbfunc - the pmix_validation_cbfunc_t function to be called upon completion + * of the request + * + * cbdata - pointer to an object to be returned when cbfunc is called + * + * Returned values: + * PMIX_SUCCESS - indicates that the request has been successfully communicated to + * the local PMIx server. The response will be coming in the provided + * callback function. + * + * Any other value indicates an appropriate error condition. The callback function + * will _not_ be called in such cases. + */ +PMIX_EXPORT pmix_status_t PMIx_Validate_credential(const pmix_byte_object_t *cred, + const pmix_info_t info[], size_t ninfo, + pmix_info_t **results, size_t *nresults); + +PMIX_EXPORT pmix_status_t PMIx_Validate_credential_nb(const pmix_byte_object_t *cred, + const pmix_info_t info[], size_t ninfo, + pmix_validation_cbfunc_t cbfunc, void *cbdata); + + +/* Construct a new group composed of the specified processes and identified with + * the provided group identifier. Both blocking and non-blocking versions + * are provided (the callback function for the non-blocking form will be called + * once all specified processes have joined the group). The group identifier is + * a user-defined, NULL-terminated character array of length less than or equal + * to PMIX_MAX_NSLEN. Only characters accepted by standard string comparison + * functions (e.g., strncmp) are supported. + * + * Processes may engage in multiple simultaneous group construct operations as + * desired so long as each is provided with a unique group ID. 
The info array + * can be used to pass user-level directives regarding timeout constraints and + * other options available from the PMIx server. + * + * The construct leader (if PMIX_GROUP_LEADER is provided) or all participants + * will receive events (if registered for the PMIX_GROUP_MEMBER_FAILED event) + * whenever a process fails or terminates prior to calling + * PMIx_Group_construct(_nb) – the events will contain the identifier of the + * process that failed to join plus any other information that the resource + * manager provided. This provides an opportunity for the leader to react to + * the event – e.g., to invite an alternative member to the group or to decide + * to proceed with a smaller group. The decision to proceed with a smaller group + * is communicated to the PMIx library in the results array at the end of the + * event handler. This allows PMIx to properly adjust accounting for procedure + * completion. When construct is complete, the participating PMIx servers will + * be alerted to any change in participants and each group member will (if + * registered) receive a PMIX_GROUP_MEMBERSHIP_UPDATE event updating the group + * membership. + * + * Processes in a group under construction are not allowed to leave the group + * until group construction is complete. Upon completion of the construct + * procedure, each group member will have access to the job-level information + * of all nspaces represented in the group and the contact information for + * every group member. + * + * Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event + * to be delivered to all participants so they can optionally declare a new leader. + * A new leader is identified by providing the PMIX_GROUP_LEADER attribute in + * the results array in the return of the event handler. Only one process is + * allowed to return that attribute, declaring itself as the new leader. 
Results + * of the leader selection will be communicated to all participants via a + * PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader + * was selected, then the status code provided in the event handler will provide + * an error value so the participants can take appropriate action. + * + * Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the leader failed + * event handler will cause the construct process to abort. Those processes + * engaged in the blocking construct will return from the call with the + * PMIX_GROUP_CONSTRUCT_ABORT status. Non-blocking participants will have + * their callback function executed with that status. + * + * Some relevant attributes for this operation: + * PMIX_GROUP_LEADER - declare this process to be the leader of the construction + * procedure. If a process provides this attribute, then + * failure notification for any participating process will + * go only to that one process. In the absence of a + * declared leader, failure events go to all participants. + * PMIX_GROUP_OPTIONAL - participation is optional - do not return an error if + * any of the specified processes terminate + * without having joined (default=false) + * PMIX_GROUP_NOTIFY_TERMINATION - notify remaining members when another member + * terminates without first leaving the + * group (default=false) + * PMIX_GROUP_ASSIGN_CONTEXT_ID - requests that the RM assign a unique context + * ID (size_t) to the group. The value is returned + * in the PMIX_GROUP_CONSTRUCT_COMPLETE event + * PMIX_TIMEOUT - return an error if the group doesn't assemble within the + * specified number of seconds. 
Targets the scenario where a + * process fails to call PMIx_Group_connect due to hanging + * + */ +PMIX_EXPORT pmix_status_t PMIx_Group_construct(const char grp[], + const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t directives[], size_t ndirs, + pmix_info_t **results, size_t *nresults); + +PMIX_EXPORT pmix_status_t PMIx_Group_construct_nb(const char grp[], + const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* Explicitly invite specified processes to join a group. + * + * Each invited process will be notified of the invitation via the PMIX_GROUP_INVITED + * event. The processes being invited must have registered for the PMIX_GROUP_INVITED + * event in order to be notified of the invitation. When ready to respond, each invited + * process provides a response using the appropriate form of PMIx_Group_join. This will + * notify the inviting process that the invitation was either accepted (via the + * PMIX_GROUP_INVITE_ACCEPTED event) or declined (via the PMIX_GROUP_INVITE_DECLINED event). + * The inviting process will also receive PMIX_GROUP_MEMBER_FAILED events whenever a + * process fails or terminates prior to responding to the invitation. + * + * Upon accepting the invitation, both the inviting and invited process will receive + * access to the job-level information of each other’s nspaces and the contact + * information of the other process. + * + * Some relevant attributes for this operation: + * PMIX_GROUP_ASSIGN_CONTEXT_ID - requests that the RM assign a unique context + * ID (size_t) to the group. The value is returned + * in the PMIX_GROUP_CONSTRUCT_COMPLETE event + * PMIX_TIMEOUT (int): return an error if the group doesn’t assemble within the + * specified number of seconds. 
Targets the scenario where a + * process fails to call PMIx_Group_connect due to hanging + * + * The inviting process is automatically considered the leader of the asynchronous + * group construction procedure and will receive all failure or termination events + * for invited members prior to completion. The inviting process is required to + * provide a PMIX_GROUP_CONSTRUCT_COMPLETE event once the group has been fully + * assembled – this event will be distributed to all participants along with the + * final membership. + * + * Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event + * to be delivered to all participants so they can optionally declare a new leader. + * A new leader is identified by providing the PMIX_GROUP_LEADER attribute in + * the results array in the return of the event handler. Only one process is + * allowed to return that attribute, declaring itself as the new leader. Results + * of the leader selection will be communicated to all participants via a + * PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader + * was selected, then the status code provided in the event handler will provide + * an error value so the participants can take appropriate action. + * + * Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the event + * handler will cause all participants to receive an event notifying them + * of that status. + */ +PMIX_EXPORT pmix_status_t PMIx_Group_invite(const char grp[], + const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, + pmix_info_t **results, size_t *nresult); + +PMIX_EXPORT pmix_status_t PMIx_Group_invite_nb(const char grp[], + const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* Respond to an invitation to join a group that is being asynchronously constructed. 
+ * + * The process must have registered for the PMIX_GROUP_INVITED event in order to be + * notified of the invitation. When ready to respond, the process provides a response + * using the appropriate form of PMIx_Group_join. + * + * Critical Note: Since the process is alerted to the invitation in a PMIx event handler, + * the process must not use the blocking form of this call unless it first “thread shifts” + * out of the handler and into its own thread context. Likewise, while it is safe to call + * the non-blocking form of the API from the event handler, the process must not block + * in the handler while waiting for the callback function to be called. + * + * Calling this function causes the group “leader” to be notified that the process has + * either accepted or declined the request. The blocking form of the API will return + * once the group has been completely constructed or the group’s construction has failed + * (as determined by the leader) – likewise, the callback function of the non-blocking + * form will be executed upon the same conditions. + * + * Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event + * to be delivered to all participants so they can optionally declare a new leader. + * A new leader is identified by providing the PMIX_GROUP_LEADER attribute in + * the results array in the return of the event handler. Only one process is + * allowed to return that attribute, declaring itself as the new leader. Results + * of the leader selection will be communicated to all participants via a + * PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader + * was selected, then the status code provided in the event handler will provide + * an error value so the participants can take appropriate action. + * + * Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the leader failed + * event handler will cause all participants to receive an event notifying them + * of that status. 
Similarly, the leader may elect to abort the procedure + * by either returning PMIX_GROUP_CONSTRUCT_ABORT from the handler assigned + * to the PMIX_GROUP_INVITE_ACCEPTED or PMIX_GROUP_INVITE_DECLINED codes, or + * by generating an event for the abort code. Abort events will be sent to + * all invited participants. + */ +PMIX_EXPORT pmix_status_t PMIx_Group_join(const char grp[], + const pmix_proc_t *leader, + pmix_group_opt_t opt, + const pmix_info_t info[], size_t ninfo, + pmix_info_t **results, size_t *nresult); + +PMIX_EXPORT pmix_status_t PMIx_Group_join_nb(const char grp[], + const pmix_proc_t *leader, + pmix_group_opt_t opt, + const pmix_info_t info[], size_t ninfo, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* Leave a PMIx Group. Calls to PMIx_Group_leave (or its non-blocking form) will cause + * a PMIX_GROUP_LEFT event to be generated notifying all members of the group of the + * caller’s departure. The function will return (or the non-blocking function will + * execute the specified callback function) once the event has been locally generated + * and is not indicative of remote receipt. All PMIx-based collectives such as + * PMIx_Fence in action across the group will automatically be adjusted if the + * collective was called with the PMIX_GROUP_FT_COLLECTIVE attribute (default is + * false) – otherwise, the standard error return behavior will be provided. + * + * Critical Note: The PMIx_Group_leave API is intended solely for asynchronous + * departures of individual processes from a group as it is not a scalable + * operation – i.e., when a process determines it should no longer be a part of a + * defined group, but the remainder of the group retains a valid reason to continue + * in existence. Developers are advised to use PMIx_Group_destruct (or its + * non-blocking form) for all other scenarios as it represents a more scalable + * operation. 
+ */ +PMIX_EXPORT pmix_status_t PMIx_Group_leave(const char grp[], + const pmix_info_t info[], size_t ninfo); + +PMIX_EXPORT pmix_status_t PMIx_Group_leave_nb(const char grp[], + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); + +/* Destruct a group identified by the provided group identifier. Both blocking and + * non-blocking versions are provided (the callback function for the non-blocking + * form will be called once all members of the group have called “destruct”). + * Processes may engage in multiple simultaneous group destruct operations as + * desired so long as each involves a unique group ID. The info array can be used + * to pass user-level directives regarding timeout constraints and other options + * available from the PMIx server. + * + * Some relevant attributes for this operation: + * + * PMIX_TIMEOUT (int): return an error if the group doesn’t destruct within the + * specified number of seconds. Targets the scenario where + * a process fails to call PMIx_Group_destruct due to hanging + * + * The destruct API will return an error if any group process fails or terminates + * prior to calling PMIx_Group_destruct or its non-blocking version unless the + * PMIX_GROUP_NOTIFY_TERMINATION attribute was provided (with a value of true) at + * time of group construction. If notification was requested, then a event will + * be delivered (using PMIX_GROUP_MEMBER_FAILED) for each process that fails to + * call destruct and the destruct tracker updated to account for the lack of + * participation. The PMIx_Group_destruct operation will subsequently return + * PMIX_SUCCESS when the remaining processes have all called destruct – i.e., the + * event will serve in place of return of an error. 
+ */ +PMIX_EXPORT pmix_status_t PMIx_Group_destruct(const char grp[], + const pmix_info_t info[], size_t ninfo); + +PMIX_EXPORT pmix_status_t PMIx_Group_destruct_nb(const char grp[], + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); + +/****************************************/ +/**** COMMON SUPPORT FUNCTIONS ****/ +/****************************************/ + +/****** EVENT NOTIFICATION SUPPORT ******/ +/* Register an event handler to report events. Three types of events + * can be reported: + * + * (a) those that occur within the client library, but are not + * reportable via the API itself (e.g., loss of connection to + * the server). These events typically occur during behind-the-scenes + * non-blocking operations. + * + * (b) job-related events such as the failure of another process in + * the job or in any connected job, impending failure of hardware + * within the job's usage footprint, etc. + * + * (c) system notifications that are made available by the local + * administrators + * + * By default, only events that directly affect the process and/or + * any process to which it is connected (via the PMIx_Connect call) + * will be reported. Options to modify that behavior can be provided + * in the info array + * + * Both the client application and the resource manager can register + * err handlers for specific events. PMIx client/server calls the registered + * err handler upon receiving event notify notification (via PMIx_Notify_event) + * from the other end (Resource Manager/Client application). + * + * Multiple err handlers can be registered for different events. PMIX returns + * an integer reference to each register handler in the callback fn. The caller + * must retain the reference in order to deregister the evhdlr. + * Modification of the notification behavior can be accomplished by + * deregistering the current evhdlr, and then registering it + * using a new set of info values. 
+ * + * If cbfunc is NULL, then this is treated as a BLOCKING call - a positive + * return value represents the reference ID for the request, while + * negative values indicate the corresponding error + * + * See pmix_common.h for a description of the notification function */ +PMIX_EXPORT pmix_status_t PMIx_Register_event_handler(pmix_status_t codes[], size_t ncodes, + pmix_info_t info[], size_t ninfo, + pmix_notification_fn_t evhdlr, + pmix_hdlr_reg_cbfunc_t cbfunc, + void *cbdata); + +/* Deregister an event handler + * evhdlr_ref is the reference returned by PMIx from the call to + * PMIx_Register_event_handler. If non-NULL, the provided cbfunc + * will be called to confirm removal of the designated handler */ +PMIX_EXPORT pmix_status_t PMIx_Deregister_event_handler(size_t evhdlr_ref, + pmix_op_cbfunc_t cbfunc, + void *cbdata); + +/* Report an event for notification via any + * registered evhdlr. + * + * This function allows the host server to direct the server + * convenience library to notify all registered local procs of + * an event. The event can be local, or anywhere in the cluster. + * The status indicates the event being reported. + * + * The client application can also call this function to notify the + * resource manager and/or other processes of an event it encountered. + * It can also be used to asynchronously notify other parts of its + * own internal process - e.g., for one library to notify another + * when initialized inside the process. + * + * status - status code indicating the event being reported + * + * source - the process that generated the event + * + * range - the range in which the event is to be reported. For example, + * a value of PMIX_RANGE_LOCAL would instruct the system + * to only notify procs on the same local node as the + * event generator. + * + * info - an array of pmix_info_t structures provided by the event + * generator to pass any additional information about the + * event. 
This can include an array of pmix_proc_t structs + * describing the processes impacted by the event, the nature + * of the event and its severity, etc. The precise contents + * of the array will depend on the event generator. + * + * ninfo - number of elements in the info array + * + * cbfunc - callback function to be called upon completion of the + * notify_event function's actions. Note that any messages + * will have been queued, but may not have been transmitted + * by this time. Note that the caller is required to maintain + * the input data until the callback function has been executed! + * If cbfunc is NULL, then this is treated as a BLOCKING call and + * the result of the operation is provided in the returned + * status + * + * cbdata - the caller's provided void* object + */ +PMIX_EXPORT pmix_status_t PMIx_Notify_event(pmix_status_t status, + const pmix_proc_t *source, + pmix_data_range_t range, + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); + + +/****** FABRIC-RELATED APIS ******/ +/* Register for access to fabric-related information, including + * communication cost matrix. This call must be made prior to + * requesting information from a fabric. + * + * fabric - address of a pmix_fabric_t (backed by storage). User + * may populate the "name" field at will - PMIx does not + * utilize this field + * + * directives - an optional array of values indicating desired + * behaviors and/or fabric to be accessed. 
If NULL, + * then the highest priority available fabric will + * be used + * + * ndirs - number of elements in the directives array + * + * Return values include: + * + * PMIX_SUCCESS - indicates success + */ +PMIX_EXPORT pmix_status_t PMIx_Fabric_register(pmix_fabric_t *fabric, + const pmix_info_t directives[], + size_t ndirs); + +PMIX_EXPORT pmix_status_t PMIx_Fabric_register_nb(pmix_fabric_t *fabric, + const pmix_info_t directives[], + size_t ndirs, + pmix_op_cbfunc_t cbfunc, void *cbdata); + + +/* Update fabric-related information. This call can be made at any time to request an update of the + * fabric information contained in the provided pmix_fabric_t object. The caller is not allowed + * to access the provided pmix_fabric_t until the call has returned. + * + * fabric - pointer to the pmix_fabric_t struct provided to + * the registration function + * + * Return values include: + * + * PMIX_SUCCESS - indicates successful update + */ +PMIX_EXPORT pmix_status_t PMIx_Fabric_update(pmix_fabric_t *fabric); + +PMIX_EXPORT pmix_status_t PMIx_Fabric_update_nb(pmix_fabric_t *fabric, + pmix_op_cbfunc_t cbfunc, void *cbdata); + + +/* Deregister a fabric object, providing an opportunity for + * the PMIx server library to cleanup any information + * (e.g., cost matrix) associated with it + * + * fabric - pointer to the pmix_fabric_t struct provided + * to the registration function + */ +PMIX_EXPORT pmix_status_t PMIx_Fabric_deregister(pmix_fabric_t *fabric); + +PMIX_EXPORT pmix_status_t PMIx_Fabric_deregister_nb(pmix_fabric_t *fabric, + pmix_op_cbfunc_t cbfunc, void *cbdata); + + +/* Compute the distance information for the current process + * Returns an array of distances from the current process + * location to each of the local devices of the specified type(s) + * + * distances - pointer to location where the array of + * distances is to be returned + * + * ndist - number of elements in the distances array + * + * Return values include: + * + * PMIX_SUCCESS - distance 
array was successfully returned + * Other error + */ +PMIX_EXPORT pmix_status_t PMIx_Compute_distances(pmix_topology_t *topo, + pmix_cpuset_t *cpuset, + pmix_info_t info[], size_t ninfo, + pmix_device_distance_t *distances[], + size_t *ndist); + +PMIX_EXPORT pmix_status_t PMIx_Compute_distances_nb(pmix_topology_t *topo, + pmix_cpuset_t *cpuset, + pmix_info_t info[], size_t ninfo, + pmix_device_dist_cbfunc_t cbfunc, + void *cbdata); + +/* Load the local hardware topology description + * + * topo - pointer to a pmix_topology_t object. This object + * must be initialized! If a particular "source" + * for the topology is required (e.g., "hwloc"), then + * the "source" field of the object must be set to + * that value + * + * Return values include: + * PMIX_SUCCESS - indicates return of a valid value + * PMIX_ERR_NOT_FOUND - provided source is not available + * PMIX_ERR_NOT_SUPPORTED - current implementation does not support this option + */ +PMIX_EXPORT pmix_status_t PMIx_Load_topology(pmix_topology_t *topo); + + +PMIX_EXPORT void PMIx_Topology_destruct(pmix_topology_t *topo); + +/* Get the PU binding bitmap from its string representation + * + * cpuset_string - string representation of the binding bitmap + * (as returned by PMIx_Get using the PMIX_CPUSET key) + * + * cpuset - pointer to a pmix_cpuset_t object where the result + * is to be stored + * + * Return values include: + * PMIX_SUCCESS - indicates return of a valid value + * PMIX_ERR_NOT_FOUND - provided source is not available + * PMIX_ERR_NOT_SUPPORTED - current implementation does not support this option + */ +PMIX_EXPORT pmix_status_t PMIx_Parse_cpuset_string(const char *cpuset_string, + pmix_cpuset_t *cpuset); + +PMIX_EXPORT pmix_status_t PMIx_Get_cpuset(pmix_cpuset_t *cpuset, pmix_bind_envelope_t ref); + +PMIX_EXPORT void PMIx_Cpuset_destruct(pmix_cpuset_t *cpuset); + +/* Get the relative locality of two local processes given their locality strings. 
+ * + * locality1 - String returned by the PMIx_server_generate_locality_string API + * + * locality2 - String returned by the PMIx_server_generate_locality_string API + * + * locality - Pointer to the location where the relative locality bitmask is + * to be constructed + * + * Return values include: + * PMIX_SUCCESS - indicates return of a valid value + * other error constant + */ +PMIX_EXPORT pmix_status_t PMIx_Get_relative_locality(const char *locality1, + const char *locality2, + pmix_locality_t *locality); + +PMIX_EXPORT void PMIx_Progress(void); + +/****** PRETTY-PRINT DEFINED VALUE TYPES ******/ +/* Provide a string representation for several types of value. Note + * that the provided string is statically defined and must NOT be + * free'd. Supported value types: + * + * - pmix_status_t (PMIX_STATUS) + * - pmix_scope_t (PMIX_SCOPE) + * - pmix_persistence_t (PMIX_PERSIST) + * - pmix_data_range_t (PMIX_DATA_RANGE) + * - pmix_info_directives_t (PMIX_INFO_DIRECTIVES) + * - pmix_data_type_t (PMIX_DATA_TYPE) + * - pmix_alloc_directive_t (PMIX_ALLOC_DIRECTIVE) + * - pmix_iof_channel_t (PMIX_IOF_CHANNEL) + * - pmix_job_state_t (PMIX_JOB_STATE) + * - pmix_proc_state_t (PMIX_PROC_STATE) + */ +PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t status); +PMIX_EXPORT const char* PMIx_Proc_state_string(pmix_proc_state_t state); +PMIX_EXPORT const char* PMIx_Scope_string(pmix_scope_t scope); +PMIX_EXPORT const char* PMIx_Persistence_string(pmix_persistence_t persist); +PMIX_EXPORT const char* PMIx_Data_range_string(pmix_data_range_t range); +PMIX_EXPORT const char* PMIx_Data_type_string(pmix_data_type_t type); +PMIX_EXPORT const char* PMIx_Alloc_directive_string(pmix_alloc_directive_t directive); +PMIX_EXPORT const char* PMIx_IOF_channel_string(pmix_iof_channel_t channel); +PMIX_EXPORT const char* PMIx_Job_state_string(pmix_job_state_t state); +PMIX_EXPORT const char* PMIx_Get_attribute_string(const char *attribute); +PMIX_EXPORT const char* 
PMIx_Get_attribute_name(const char *attrstring); +PMIX_EXPORT const char* PMIx_Link_state_string(pmix_link_state_t state); +PMIX_EXPORT const char* PMIx_Device_type_string(pmix_device_type_t type); +PMIX_EXPORT const char* PMIx_Value_comparison_string(pmix_value_cmp_t cmp); + +/* the following print statements return ALLOCATED strings + * that the user must release when done */ +PMIX_EXPORT char* PMIx_Info_string(const pmix_info_t *info); +PMIX_EXPORT char* PMIx_Value_string(const pmix_value_t *value); +PMIX_EXPORT char* PMIx_Info_directives_string(pmix_info_directives_t directives); + +/* Get the PMIx version string. Note that the provided string is + * statically defined and must NOT be free'd */ +PMIX_EXPORT const char* PMIx_Get_version(void); + +/* Store some data locally for retrieval by other areas of the + * proc. This is data that has only internal scope - it will + * never be "pushed" externally */ +PMIX_EXPORT pmix_status_t PMIx_Store_internal(const pmix_proc_t *proc, + const char key[], pmix_value_t *val); + + +/****** DATA BUFFER PACK/UNPACK SUPPORT ******/ +/** + * Top-level interface function to pack one or more values into a + * buffer. + * + * The pack function packs one or more values of a specified type into + * the specified buffer. The buffer must have already been + * initialized via the PMIX_DATA_BUFFER_CREATE or PMIX_DATA_BUFFER_CONSTRUCT + * call - otherwise, the pack_value function will return an error. + * Providing an unsupported type flag will likewise be reported as an error. + * + * Note that any data to be packed that is not hard type cast (i.e., + * not type cast to a specific size) may lose precision when unpacked + * by a non-homogeneous recipient. The PACK function will do its best to deal + * with heterogeneity issues between the packer and unpacker in such + * cases. 
Sending a number larger than can be handled by the recipient + * will return an error code (generated upon unpacking) - + * the error cannot be detected during packing. + * + * The identity of the intended recipient of the packed buffer (i.e., the + * process that will be unpacking it) is used solely to resolve any data type + * differences between PMIx versions. The recipient must, therefore, be + * known to the user prior to calling the pack function so that the + * PMIx library is aware of the version the recipient is using. + * + * @param *target Pointer to a pmix_proc_t structure containing the + * nspace/rank of the process that will be unpacking the final buffer. + * A NULL value may be used to indicate that the target is based on + * the same PMIx version as the caller. + * + * @param *buffer A pointer to the buffer into which the value is to + * be packed. + * + * @param *src A void* pointer to the data that is to be packed. Note + * that strings are to be passed as (char **) - i.e., the caller must + * pass the address of the pointer to the string as the void*. This + * allows PMIx to use a single pack function, but still allow + * the caller to pass multiple strings in a single call. + * + * @param num_values An int32_t indicating the number of values that are + * to be packed, beginning at the location pointed to by src. A string + * value is counted as a single value regardless of length. The values + * must be contiguous in memory. Arrays of pointers (e.g., string + * arrays) should be contiguous, although (obviously) the data pointed + * to need not be contiguous across array entries. + * + * @param type The type of the data to be packed - must be one of the + * PMIX defined data types. + * + * @retval PMIX_SUCCESS The data was packed as requested. + * + * @retval PMIX_ERROR(s) An appropriate PMIX error code indicating the + * problem encountered. This error code should be handled + * appropriately. 
+ * + * @code + * pmix_data_buffer_t *buffer; + * int32_t src; + * + * PMIX_DATA_BUFFER_CREATE(buffer); + * status_code = PMIx_Data_pack(buffer, &src, 1, PMIX_INT32); + * @endcode + */ +PMIX_EXPORT pmix_status_t PMIx_Data_pack(const pmix_proc_t *target, + pmix_data_buffer_t *buffer, + void *src, int32_t num_vals, + pmix_data_type_t type); + +/** + * Unpack values from a buffer. + * + * The unpack function unpacks the next value (or values) of a + * specified type from the specified buffer. + * + * The buffer must have already been initialized via an PMIX_DATA_BUFFER_CREATE or + * PMIX_DATA_BUFFER_CONSTRUCT call (and assumedly filled with some data) - + * otherwise, the unpack_value function will return an + * error. Providing an unsupported type flag will likewise be reported + * as an error, as will specifying a data type that DOES NOT match the + * type of the next item in the buffer. An attempt to read beyond the + * end of the stored data held in the buffer will also return an + * error. + * + * NOTE: it is possible for the buffer to be corrupted and that + * PMIx will *think* there is a proper variable type at the + * beginning of an unpack region - but that the value is bogus (e.g., just + * a byte field in a string array that so happens to have a value that + * matches the specified data type flag). Therefore, the data type error check + * is NOT completely safe. This is true for ALL unpack functions. + * + * + * Unpacking values is a "nondestructive" process - i.e., the values are + * not removed from the buffer. It is therefore possible for the caller + * to re-unpack a value from the same buffer by resetting the unpack_ptr. + * + * Warning: The caller is responsible for providing adequate memory + * storage for the requested data. As noted below, the user + * must provide a parameter indicating the maximum number of values that + * can be unpacked into the allocated memory. 
If more values exist in the + * buffer than can fit into the memory storage, then the function will unpack + * what it can fit into that location and return an error code indicating + * that the buffer was only partially unpacked. + * + * Note that any data that was not hard type cast (i.e., not type cast + * to a specific size) when packed may lose precision when unpacked by + * a non-homogeneous recipient. PMIx will do its best to deal with + * heterogeneity issues between the packer and unpacker in such + * cases. Sending a number larger than can be handled by the recipient + * will return an error code generated upon unpacking - these errors + * cannot be detected during packing. + * + * The identity of the source of the packed buffer (i.e., the + * process that packed it) is used solely to resolve any data type + * differences between PMIx versions. The source must, therefore, be + * known to the user prior to calling the unpack function so that the + * PMIx library is aware of the version the source used. + * + * @param *source Pointer to a pmix_proc_t structure containing the + * nspace/rank of the process that packed the provided buffer. + * A NULL value may be used to indicate that the source is based on + * the same PMIx version as the caller. + * + * @param *buffer A pointer to the buffer from which the value will be + * extracted. + * + * @param *dest A void* pointer to the memory location into which the + * data is to be stored. Note that these values will be stored + * contiguously in memory. For strings, this pointer must be to (char + * **) to provide a means of supporting multiple string + * operations. The unpack function will allocate memory for each + * string in the array - the caller must only provide adequate memory + * for the array of pointers. + * + * @param type The type of the data to be unpacked - must be one of + * the BFROP defined data types. + * + * @retval *max_num_values The number of values actually unpacked. 
In + * most cases, this should match the maximum number provided in the + * parameters - but in no case will it exceed the value of this + * parameter. Note that if you unpack fewer values than are actually + * available, the buffer will be in an unpackable state - the function will + * return an error code to warn of this condition. + * + * @note The unpack function will return the actual number of values + * unpacked in this location. + * + * @retval PMIX_SUCCESS The next item in the buffer was successfully + * unpacked. + * + * @retval PMIX_ERROR(s) The unpack function returns an error code + * under one of several conditions: (a) the number of values in the + * item exceeds the max num provided by the caller; (b) the type of + * the next item in the buffer does not match the type specified by + * the caller; or (c) the unpack failed due to either an error in the + * buffer or an attempt to read past the end of the buffer. + * + * @code + * pmix_data_buffer_t *buffer; + * int32_t dest; + * char **string_array; + * int32_t num_values; + * + * num_values = 1; + * status_code = PMIx_Data_unpack(buffer, (void*)&dest, &num_values, PMIX_INT32); + * + * num_values = 5; + * string_array = pmix_malloc(num_values*sizeof(char *)); + * status_code = PMIx_Data_unpack(buffer, (void*)(string_array), &num_values, PMIX_STRING); + * + * @endcode + */ +PMIX_EXPORT pmix_status_t PMIx_Data_unpack(const pmix_proc_t *source, + pmix_data_buffer_t *buffer, void *dest, + int32_t *max_num_values, + pmix_data_type_t type); + +/** + * Copy a data value from one location to another. + * + * Since registered data types can be complex structures, the system + * needs some way to know how to copy the data from one location to + * another (e.g., for storage in the registry). This function, which + * can call other copy functions to build up complex data types, defines + * the method for making a copy of the specified data type. 
+ * + * @param **dest The address of a pointer into which the + * address of the resulting data is to be stored. + * + * @param *src A pointer to the memory location from which the + * data is to be copied. + * + * @param type The type of the data to be copied - must be one of + * the PMIx defined data types. + * + * @retval PMIX_SUCCESS The value was successfully copied. + * + * @retval PMIX_ERROR(s) An appropriate error code. + * + */ +PMIX_EXPORT pmix_status_t PMIx_Data_copy(void **dest, void *src, + pmix_data_type_t type); + +/** + * Print a data value. + * + * Since registered data types can be complex structures, the system + * needs some way to know how to print them (i.e., convert them to a string + * representation). Provided for debug purposes. + * + * @retval PMIX_SUCCESS The value was successfully printed. + * + * @retval PMIX_ERROR(s) An appropriate error code. + */ +PMIX_EXPORT pmix_status_t PMIx_Data_print(char **output, char *prefix, + void *src, pmix_data_type_t type); + +/** + * Copy a payload from one buffer to another + * + * This function will append a copy of the payload in one buffer into + * another buffer. + * NOTE: This is NOT a destructive procedure - the + * source buffer's payload will remain intact, as will any pre-existing + * payload in the destination's buffer. + */ +PMIX_EXPORT pmix_status_t PMIx_Data_copy_payload(pmix_data_buffer_t *dest, + pmix_data_buffer_t *src); + +/** + * Unload a buffer into a byte object + * + * The unload function provides the caller with a pointer to the data + * payload within the buffer and the size of that payload. This allows + * the user to directly access the payload. + * + * @note This is a destructive operation. While the payload is + * undisturbed, the function will clear the buffer's pointers to the + * payload. Thus, the buffer and the payload are completely separated, + * leaving the caller free to the buffer. + * + * @param buffer A pointer to the buffer whose payload is to be + * unloaded. 
+ * + * @param payload The address of a pmix_byte_object_t into which + * the buffer is to be unloaded + * + * @retval PMIX_SUCCESS The request was successfully completed. + * + * @retval PMIX_ERROR(s) An appropriate error code indicating the + * problem will be returned. This should be handled appropriately by + * the caller. + * + * @code + * pmix_data_buffer_t *buffer; + * pmix_byte_object_t payload; + * + * status_code = PMIx_Data_unload(buffer, &payload); + * @endcode + */ +PMIX_EXPORT pmix_status_t PMIx_Data_unload(pmix_data_buffer_t *buffer, + pmix_byte_object_t *payload); + +/** + * Load a data payload into a buffer. + * + * The load function allows the caller to replace the payload in a + * buffer with one provided by the caller. If a payload already exists + * in the buffer, the function will "free" the existing data to + * release it, and then replace the data payload with the one provided + * by the caller. + * + * @note The buffer must be allocated in advance - failing to do so + * will cause the load function to return an error code. + * + * @note The caller is responsible for pre-packing the provided + * payload - the load function cannot convert to network byte order + * any data contained in the provided payload. + * + * @note The "payload" object will be empty upon completion of + * this operation. + * + * @param buffer A pointer to the buffer into which the payload is to + * be loaded. + * + * @param payload A pointer to the pmix_byte_object_t containing the + * desired payload + * + * @retval PMIX_SUCCESS The request was successfully completed + * + * @retval PMIX_ERROR(s) An appropriate error code indicating the + * problem will be returned. This should be handled appropriately by + * the caller.
+ * + * @code + * pmix_data_buffer_t *buffer; + * pmix_byte_object_t payload; + * + * PMIX_DATA_BUFFER_CREATE(buffer); + * status_code = PMIx_Data_load(buffer, &payload); + * @endcode + */ +PMIX_EXPORT pmix_status_t PMIx_Data_load(pmix_data_buffer_t *buffer, + pmix_byte_object_t *payload); + +/** +* Embed a data payload into a buffer. +* +* The embed function is identical in operation to PMIx_Data_load +* except that it does NOT "clear" the payload upon completion. +* +* @note The buffer must be allocated in advance - failing to do so +* will cause the function to return an error code. +* +* @note The caller is responsible for pre-packing the provided +* payload - the load function cannot convert to network byte order +* any data contained in the provided payload. +* +* @note The "payload" object is unaltered by this operation. +* +* @param buffer A pointer to the buffer into which the payload is to +* be loaded. +* +* @param payload A pointer to the pmix_byte_object_t containing the +* desired payload +* +* @retval PMIX_SUCCESS The request was successfully completed +* +* @retval PMIX_ERROR(s) An appropriate error code indicating the +* problem will be returned. This should be handled appropriately by +* the caller. +* +* @code +* pmix_data_buffer_t *buffer; +* pmix_byte_object_t payload; +* +* PMIX_DATA_BUFFER_CREATE(buffer); +* status_code = PMIx_Data_embed(buffer, &payload); +* @endcode +*/ +PMIX_EXPORT pmix_status_t PMIx_Data_embed(pmix_data_buffer_t *buffer, + const pmix_byte_object_t *payload); + +/** +* Compress data using loss-less compression algorithm. +* +* Compress the provided data block. Destination memory +* will be allocated if successful operation is concluded. Caller +* is responsible for release of the allocated region. The input +* data block will remain unaltered. +* +* Note: the compress function will return "false" if the operation +* would not result in a smaller data block.
+* +* @param inbytes A pointer to the data to be compressed +* +* @param size Number of bytes in the input data region +* +* @param outbytes Address where a pointer to the compressed +* data region is to be returned +* +* @param nbytes Address where the number of bytes in the +* compressed data region is to be returned +* +* @retval true The input data was compressed. +* +* @retval false The input data was not compressed +* +*/ +PMIX_EXPORT bool PMIx_Data_compress(const uint8_t *inbytes, + size_t size, + uint8_t **outbytes, + size_t *nbytes); + +/** +* Decompress data. +* +* Decompress the provided data block. Destination memory +* will be allocated if successful operation is concluded. Caller +* is responsible for release of the allocated region. The input +* data block will remain unaltered. +* +* note: only data compressed using PMIx_Data_compress can +* be input to this function +* +* @param inbytes A pointer to the data to be decompressed +* +* @param size Number of bytes in the input data region +* +* @param outbytes Address where a pointer to the decompressed +* data region is to be returned +* +* @param nbytes Address where the number of bytes in the +* decompressed data region is to be returned +* +* @retval true The input data was decompressed +* +* @retval false The input data was not decompressed +* +*/ +PMIX_EXPORT bool PMIx_Data_decompress(const uint8_t *inbytes, + size_t size, + uint8_t **outbytes, + size_t *nbytes); + + +/* We had to put some function definitions into pmix_deprecated.h for + * now-deprecated macros that utilize them as there are people who only + * included pmix_common.h if they were using macros but not APIs. + * However, we really want those APIs here so people will + * see them and know they exist. So include them here as well. */ + +#ifndef PMIx_DEPRECATED_H +/* Load data into a pmix_value_t structure. 
The data can be of any + * PMIx data type - which means the load can be somewhat complex + * to implement (e.g., in the case of a pmix_data_array_t). The + * data is COPIED into the value struct + */ +PMIX_EXPORT pmix_status_t PMIx_Value_load(pmix_value_t *val, + const void *data, + pmix_data_type_t type); + +/* Unload data from a pmix_value_t structure. */ +PMIX_EXPORT pmix_status_t PMIx_Value_unload(pmix_value_t *val, + void **data, + size_t *sz); + +PMIX_EXPORT void PMIx_Value_destruct(pmix_value_t *val); + +/* Transfer data from one pmix_value_t to another - this is actually + * executed as a COPY operation, so the original data is not altered. + */ +PMIX_EXPORT pmix_status_t PMIx_Value_xfer(pmix_value_t *dest, + const pmix_value_t *src); + +/* Compare the contents of two pmix_value_t structures */ +PMIX_EXPORT pmix_value_cmp_t PMIx_Value_compare(pmix_value_t *v1, + pmix_value_t *v2); + +PMIX_EXPORT void PMIx_Data_array_destruct(pmix_data_array_t *d); + +/* Load key/value data into a pmix_info_t struct. Note that this + * effectively is a PMIX_LOAD_KEY operation to copy the key, + * followed by a PMIx_Value_load to COPY the data into the + * pmix_value_t in the provided info struct */ +PMIX_EXPORT pmix_status_t PMIx_Info_load(pmix_info_t *info, + const char *key, + const void *data, + pmix_data_type_t type); + +/* Transfer data from one pmix_info_t to another - this is actually + * executed as a COPY operation, so the original data is not altered */ +PMIX_EXPORT pmix_status_t PMIx_Info_xfer(pmix_info_t *dest, + const pmix_info_t *src); + +/* Constructing arrays of pmix_info_t for passing to an API can + * be tedious since the pmix_info_t itself is not a "list object". + * Since this is a very frequent operation, a set of APIs has been + * provided that opaquely manipulates internal PMIx list structures + * for this purpose. The user only need provide a void* pointer to + * act as the caddy for the internal list object. 
+ */ + +/* Initialize a list of pmix_info_t structures */ +PMIX_EXPORT void* PMIx_Info_list_start(void); + +/* Add data to a list of pmix_info_t structs. The "ptr" passed + * here is the pointer returned by PMIx_Info_list_start. + */ +PMIX_EXPORT pmix_status_t PMIx_Info_list_add(void *ptr, + const char *key, + const void *value, + pmix_data_type_t type); + +PMIX_EXPORT pmix_status_t PMIx_Info_list_insert(void *ptr, pmix_info_t *info); + +/* Transfer the data in an existing pmix_info_t struct to a list. This + * is executed as a COPY operation, so the original data is not altered. + * The "ptr" passed here is the pointer returned by PMIx_Info_list_start + */ +PMIX_EXPORT pmix_status_t PMIx_Info_list_xfer(void *ptr, + const pmix_info_t *info); + +/* Convert the constructed list of pmix_info_t structs to a pmix_data_array_t + * of pmix_info_t. Data on the list is COPIED to the array elements. + */ +PMIX_EXPORT pmix_status_t PMIx_Info_list_convert(void *ptr, pmix_data_array_t *par); + +/* Release all data on the list and destruct all internal tracking */ +PMIX_EXPORT void PMIx_Info_list_release(void *ptr); + +#endif + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif + +#endif diff --git a/deps/pmix/include/pmix_common.h b/deps/pmix/include/pmix_common.h new file mode 100644 index 000000000..d1e83cecc --- /dev/null +++ b/deps/pmix/include/pmix_common.h @@ -0,0 +1,4352 @@ +/* include/pmix_common.h. Generated from pmix_common.h.in by configure. */ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2019 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2016-2022 IBM Corporation. All rights reserved. + * Copyright (c) 2016-2019 Mellanox Technologies, Inc. + * All rights reserved. + * Copyright (c) 2021 Triad National Security, LLC. All rights + * reserved. 
+ * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer listed + * in this license in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * The copyright holders provide no reassurances that the source code + * provided does not infringe any patent, copyright, or any other + * intellectual property rights of third parties. The copyright holders + * disclaim any liability to any recipient for claims brought against + * recipient by any third party for infringement of that parties + * intellectual property rights. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Copyright (c) 2020 Cisco Systems, Inc. 
All rights reserved + * Copyright (c) 2021-2022 Nanook Consulting All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIx_COMMON_H +#define PMIx_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include /* for struct timeval */ +#include /* for uid_t and gid_t */ +#include /* for uid_t and gid_t */ + +extern char **environ; + +/* Whether C compiler supports -fvisibility */ +#define PMIX_HAVE_VISIBILITY 1 + +#if PMIX_HAVE_VISIBILITY == 1 +#define PMIX_EXPORT __attribute__((__visibility__("default"))) +#else +#define PMIX_EXPORT +#endif + + +#include + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/**** PMIX CONSTANTS ****/ + +/* define maximum value and key sizes */ +#define PMIX_MAX_NSLEN 255 +#define PMIX_MAX_KEYLEN 511 + +/* define abstract types for namespaces and keys */ +typedef char pmix_nspace_t[PMIX_MAX_NSLEN+1]; +typedef char pmix_key_t[PMIX_MAX_KEYLEN+1]; + +/* define a type for rank values */ +typedef uint32_t pmix_rank_t; + +/* define a value for requests for job-level data + * where the info itself isn't associated with any + * specific rank, or when a request involves + * a rank that isn't known - e.g., when someone requests + * info thru one of the legacy interfaces where the rank + * is typically encoded into the key itself since there is + * no rank parameter in the API itself */ +#define PMIX_RANK_UNDEF UINT32_MAX +/* define a value to indicate that the user wants the + * data for the given key from every rank that posted + * that key */ +#define PMIX_RANK_WILDCARD UINT32_MAX-1 +/* other special rank values will be used to define + * groups of ranks for use in collectives */ +#define PMIX_RANK_LOCAL_NODE UINT32_MAX-2 // all ranks on local node +#define PMIX_RANK_LOCAL_PEERS UINT32_MAX-4 // all peers (i.e., all procs within the same nspace) on local node +/* define an invalid value */ +#define PMIX_RANK_INVALID UINT32_MAX-3 +/* 
define a boundary for valid ranks */ +#define PMIX_RANK_VALID UINT32_MAX-50 +/* define a macro for testing for valid ranks */ +#define PMIX_RANK_IS_VALID(r) \ + ((r) < PMIX_RANK_VALID) + +/* define a value to indicate that data applies + * to all apps in a job */ +#define PMIX_APP_WILDCARD UINT32_MAX + +/**** PMIX ENVIRONMENTAL PARAMETERS ****/ +/* URI of tool waiting for launcher to rendezvous back to it */ +#define PMIX_LAUNCHER_RNDZ_URI "PMIX_LAUNCHER_RNDZ_URI" + +/* PMIX_LAUNCHER_RNDZ_FILE - if set, contains the full pathname + * of a file the launcher is to write that contains its connection info. + * Works in addition to anything else the launcher may output. + */ +#define PMIX_LAUNCHER_RNDZ_FILE "PMIX_LAUNCHER_RNDZ_FILE" + +/* pipe to be monitored that indicates when the parent process + * terminates - used by fork'd tools to identify when the tool + * that started them has died */ +#define PMIX_KEEPALIVE_PIPE "PMIX_KEEPALIVE_PIPE" + + +/* define a set of "standard" PMIx attributes that can + * be queried. Implementations (and users) are free to extend as + * desired, so the get functions need to be capable + * of handling the "not found" condition. Note that these + * are attributes of the system and the job as opposed to + * values the application (or underlying MPI library) + * might choose to expose - i.e., they are values provided + * by the resource manager as opposed to the application. 
Thus, + * these keys are RESERVED */ +#define PMIX_ATTR_UNDEF "pmix.undef" + +/* initialization attributes */ +#define PMIX_EXTERNAL_PROGRESS "pmix.evext" // (bool) The host shall progress the PMIx library via + // calls to PMIx_Progress +#define PMIX_SERVER_TOOL_SUPPORT "pmix.srvr.tool" // (bool) The host RM wants to declare itself as willing + // to accept tool connection requests +#define PMIX_SERVER_REMOTE_CONNECTIONS "pmix.srvr.remote" // (bool) Allow connections from remote tools (do not use + // loopback device) +#define PMIX_SERVER_SYSTEM_SUPPORT "pmix.srvr.sys" // (bool) The host RM wants to declare itself as being + // the local system server for PMIx connection + // requests +#define PMIX_SERVER_SESSION_SUPPORT "pmix.srvr.sess" // (bool) The host RM wants to declare itself as being + // the local session server for PMIx connection + // requests +#define PMIX_SERVER_TMPDIR "pmix.srvr.tmpdir" // (char*) temp directory where PMIx server will place + // client rendezvous points and contact info +#define PMIX_SYSTEM_TMPDIR "pmix.sys.tmpdir" // (char*) temp directory for this system, where PMIx + // server will place tool rendezvous points and + // contact info +#define PMIX_SERVER_SHARE_TOPOLOGY "pmix.srvr.share" // (bool) server is to share its copy of the local node + // topology (whether given to it or self-discovered) with any clients. 
+#define PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server +#define PMIX_SERVER_NSPACE "pmix.srv.nspace" // (char*) Name of the nspace to use for this server +#define PMIX_SERVER_RANK "pmix.srv.rank" // (pmix_rank_t) Rank of this server +#define PMIX_SERVER_GATEWAY "pmix.srv.gway" // (bool) Server is acting as a gateway for PMIx requests + // that cannot be serviced on backend nodes + // (e.g., logging to email) +#define PMIX_SERVER_SCHEDULER "pmix.srv.sched" // (bool) Server supports system scheduler +#define PMIX_SERVER_START_TIME "pmix.srv.strtime" // (char*) Time when the server started - i.e., when the server created + // it's rendezvous file (given in ctime string format) +#define PMIX_HOMOGENEOUS_SYSTEM "pmix.homo" // (bool) The nodes comprising the session are homogeneous - i.e., they + // each contain the same number of identical packages, fabric interfaces, + // GPU, and other devices +#define PMIX_SINGLETON "pmix.singleton" // (char*) String representation (nspace.rank) of proc ID for the singleton + // the server was started to support +#define PMIX_BIND_PROGRESS_THREAD "pmix.bind.pt" // (char*) Comma-delimited ranges of CPUs that the internal PMIx progress + // thread shall be bound to +#define PMIX_BIND_REQUIRED "pmix.bind.reqd" // (bool) Return error if the internal PMIx progress thread cannot be bound + + +/* tool-related attributes */ +#define PMIX_TOOL_NSPACE "pmix.tool.nspace" // (char*) Name of the nspace to use for this tool +#define PMIX_TOOL_RANK "pmix.tool.rank" // (uint32_t) Rank of this tool +#define PMIX_SERVER_PIDINFO "pmix.srvr.pidinfo" // (pid_t) pid of the target server for a tool +#define PMIX_CONNECT_TO_SYSTEM "pmix.cnct.sys" // (bool) The requestor requires that a connection be made only to + // a local system-level PMIx server +#define PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first +#define PMIX_SERVER_URI 
"pmix.srvr.uri" // (char*) URI of server to be contacted +#define PMIX_MYSERVER_URI "pmix.mysrvr.uri" // (char*) URI of this proc's listener socket +#define PMIX_SERVER_HOSTNAME "pmix.srvr.host" // (char*) node where target server is located +#define PMIX_CONNECT_MAX_RETRIES "pmix.tool.mretries" // (uint32_t) maximum number of times to try to connect to server +#define PMIX_CONNECT_RETRY_DELAY "pmix.tool.retry" // (uint32_t) time in seconds between connection attempts +#define PMIX_TOOL_DO_NOT_CONNECT "pmix.tool.nocon" // (bool) the tool wants to use internal PMIx support, but does + // not want to connect to a PMIx server + // from the specified processes to this tool +#define PMIX_TOOL_CONNECT_OPTIONAL "pmix.tool.conopt" // (bool) tool shall connect to a server if available, but otherwise + // continue to operate unconnected +#define PMIX_LAUNCHER "pmix.tool.launcher" // (bool) tool is a launcher and needs rendezvous files created +#define PMIX_LAUNCHER_RENDEZVOUS_FILE "pmix.tool.lncrnd" // (char*) Pathname of file where connection info is to be stored +#define PMIX_TOOL_ATTACHMENT_FILE "pmix.tool.attach" // (char*) File containing connection info to be used for attaching to server +#define PMIX_PRIMARY_SERVER "pmix.pri.srvr" // (bool) The server to which the tool is connecting shall be designated + // the primary server once connection has been accomplished. +#define PMIX_NOHUP "pmix.nohup" // (bool) Any processes started on behalf of the calling tool (or the + // specified namespace, if such specification is included in the + // list of attributes) should continue after the tool disconnects + // from its server +#define PMIX_LAUNCHER_DAEMON "pmix.lnch.dmn" // (char*) Path to executable that is to be used as the backend daemon + // for the launcher. This replaces the launcher's own daemon with + // the specified executable. Note that the user is therefore + // responsible for ensuring compatibility of the specified + // executable and the host launcher. 
+#define PMIX_EXEC_AGENT "pmix.exec.agnt" // (char*) Path to executable that the launcher's backend daemons are to + // fork/exec in place of the actual application processes. The + // launcher's daemon shall pass the full command line of the + // application on the command line of the exec agent, which shall + // not connect back to the launcher's daemon. The exec agent is + // responsible for exec'ing the specified application process in + // its own place. +#define PMIX_LAUNCH_DIRECTIVES "pmix.lnch.dirs" // (pmix_data_array_t*) Array of pmix_info_t containing directives for + // the launcher - a convenience attribute for retrieving all + // directives with a single call to PMIx_Get + +/* identification attributes */ +#define PMIX_USERID "pmix.euid" // (uint32_t) effective user id +#define PMIX_GRPID "pmix.egid" // (uint32_t) effective group id +#define PMIX_VERSION_INFO "pmix.version" // (char*) PMIx version of contactor +#define PMIX_REQUESTOR_IS_TOOL "pmix.req.tool" // (bool) requesting process is a tool +#define PMIX_REQUESTOR_IS_CLIENT "pmix.req.client" // (bool) requesting process is a client process +#define PMIX_PSET_NAME "pmix.pset.nm" // (char*) The name of the newly defined process set. +#define PMIX_PSET_NAMES "pmix.pset.nms" // (pmix_data_array_t*) Returns an array of string names of the + // process sets in which the given process is a member. +#define PMIX_PSET_MEMBERS "pmix.pset.mems" // (pmix_data_array_t*) An array of pmix_proc_t containing + // the members of the newly defined process set. 
+#define PMIX_REINCARNATION "pmix.reinc" // (uint32_t) number of times this process has been instantiated - i.e., + // tracks the number of times it has been restarted + +/* model attributes */ +#define PMIX_PROGRAMMING_MODEL "pmix.pgm.model" // (char*) programming model being initialized (e.g., "MPI" or "OpenMP") +#define PMIX_MODEL_LIBRARY_NAME "pmix.mdl.name" // (char*) programming model implementation ID (e.g., "OpenMPI" or "MPICH") +#define PMIX_MODEL_LIBRARY_VERSION "pmix.mld.vrs" // (char*) programming model version string (e.g., "2.1.1") +#define PMIX_THREADING_MODEL "pmix.threads" // (char*) threading model used (e.g., "pthreads") +#define PMIX_MODEL_NUM_THREADS "pmix.mdl.nthrds" // (uint64_t) number of active threads being used by the model +#define PMIX_MODEL_NUM_CPUS "pmix.mdl.ncpu" // (uint64_t) number of cpus being used by the model +#define PMIX_MODEL_CPU_TYPE "pmix.mdl.cputype" // (char*) granularity - "hwthread", "core", etc. +#define PMIX_MODEL_PHASE_NAME "pmix.mdl.phase" // (char*) user-assigned name for a phase in the application execution - e.g., + // "cfd reduction" +#define PMIX_MODEL_PHASE_TYPE "pmix.mdl.ptype" // (char*) type of phase being executed - e.g., "matrix multiply" +#define PMIX_MODEL_AFFINITY_POLICY "pmix.mdl.tap" // (char*) thread affinity policy - e.g.: + // "master" (thread co-located with master thread), + // "close" (thread located on cpu close to master thread) + // "spread" (threads load-balanced across available cpus) + +/* attributes for TCP connections */ +#define PMIX_TCP_REPORT_URI "pmix.tcp.repuri" // (char*) output URI - '-' => stdout, '+' => stderr, or filename +#define PMIX_TCP_URI "pmix.tcp.uri" // (char*) URI of server to connect to, or file: +#define PMIX_TCP_IF_INCLUDE "pmix.tcp.ifinclude" // (char*) comma-delimited list of devices and/or CIDR notation +#define PMIX_TCP_IF_EXCLUDE "pmix.tcp.ifexclude" // (char*) comma-delimited list of devices and/or CIDR notation +#define PMIX_TCP_IPV4_PORT "pmix.tcp.ipv4" // 
(int) IPv4 port to be used +#define PMIX_TCP_IPV6_PORT "pmix.tcp.ipv6" // (int) IPv6 port to be used +#define PMIX_TCP_DISABLE_IPV4 "pmix.tcp.disipv4" // (bool) true to disable IPv4 family +#define PMIX_TCP_DISABLE_IPV6 "pmix.tcp.disipv6" // (bool) true to disable IPv6 family + + +/* general proc-level attributes */ +#define PMIX_CPUSET "pmix.cpuset" // (char*) String representation of bitmap applied to process upon launch +#define PMIX_CPUSET_BITMAP "pmix.bitmap" // (pmix_cpuset_t*) Bitmap applied to process at launch +#define PMIX_CREDENTIAL "pmix.cred" // (char*) security credential assigned to proc +#define PMIX_SPAWNED "pmix.spawned" // (bool) true if this proc resulted from a call to PMIx_Spawn +#define PMIX_NODE_OVERSUBSCRIBED "pmix.ndosub" // (bool) true if number of procs from this job on this node + // exceeds the number of slots allocated to it + +/* scratch directory locations for use by applications */ +#define PMIX_TMPDIR "pmix.tmpdir" // (char*) top-level tmp dir assigned to session +#define PMIX_NSDIR "pmix.nsdir" // (char*) sub-tmpdir assigned to namespace +#define PMIX_PROCDIR "pmix.pdir" // (char*) sub-nsdir assigned to proc +#define PMIX_TDIR_RMCLEAN "pmix.tdir.rmclean" // (bool) Resource Manager will clean session directories + + +/* information about relative ranks as assigned by the RM */ +#define PMIX_CLUSTER_ID "pmix.clid" // (char*) a string name for the cluster this proc is executing on +#define PMIX_PROCID "pmix.procid" // (pmix_proc_t*) process identifier +#define PMIX_NSPACE "pmix.nspace" // (char*) nspace of a job +#define PMIX_JOBID "pmix.jobid" // (char*) jobid assigned by scheduler +#define PMIX_APPNUM "pmix.appnum" // (uint32_t) app number within the job +#define PMIX_RANK "pmix.rank" // (pmix_rank_t) process rank within the job +#define PMIX_GLOBAL_RANK "pmix.grank" // (pmix_rank_t) rank spanning across all jobs in this session +#define PMIX_APP_RANK "pmix.apprank" // (pmix_rank_t) rank within this app +#define PMIX_NPROC_OFFSET 
"pmix.offset" // (pmix_rank_t) starting global rank of this job +#define PMIX_LOCAL_RANK "pmix.lrank" // (uint16_t) rank on this node within this job +#define PMIX_NODE_RANK "pmix.nrank" // (uint16_t) rank on this node spanning all jobs +#define PMIX_PACKAGE_RANK "pmix.pkgrank" // (uint16_t) rank within this job on the package where this proc resides +#define PMIX_LOCALLDR "pmix.lldr" // (pmix_rank_t) lowest rank on this node within this job +#define PMIX_APPLDR "pmix.aldr" // (pmix_rank_t) lowest rank in this app within this job +#define PMIX_PROC_PID "pmix.ppid" // (pid_t) pid of specified proc +#define PMIX_SESSION_ID "pmix.session.id" // (uint32_t) session identifier +#define PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for the specified nspace +#define PMIX_ALLOCATED_NODELIST "pmix.alist" // (char*) comma-delimited list of all nodes in this allocation regardless of + // whether or not they currently host procs. +#define PMIX_HOSTNAME "pmix.hname" // (char*) name of the host the specified proc is on +#define PMIX_HOSTNAME_ALIASES "pmix.alias" // (char*) comma-delimited list of names by which this node is known +#define PMIX_HOSTNAME_KEEP_FQDN "pmix.fqdn" // (bool) FQDN hostnames are being retained +#define PMIX_NODEID "pmix.nodeid" // (uint32_t) node identifier where the specified proc is located +#define PMIX_LOCAL_PEERS "pmix.lpeers" // (char*) comma-delimited string of ranks on this node within the specified nspace +#define PMIX_LOCAL_PROCS "pmix.lprocs" // (pmix_data_array_t*) array of pmix_proc_t of procs on the specified node +#define PMIX_LOCAL_CPUSETS "pmix.lcpus" // (char*) colon-delimited cpusets of local peers within the specified nspace +#define PMIX_PARENT_ID "pmix.parent" // (pmix_proc_t*) identifier of the process that called PMIx_Spawn + // to launch this proc's application +#define PMIX_EXIT_CODE "pmix.exit.code" // (int) exit code returned when proc terminated + +/* size info */ +#define PMIX_UNIV_SIZE 
"pmix.univ.size" // (uint32_t) #slots in this session +#define PMIX_JOB_SIZE "pmix.job.size" // (uint32_t) #procs in this job +#define PMIX_JOB_NUM_APPS "pmix.job.napps" // (uint32_t) #apps in this job +#define PMIX_APP_SIZE "pmix.app.size" // (uint32_t) #procs in this application +#define PMIX_LOCAL_SIZE "pmix.local.size" // (uint32_t) #procs in this job on this node +#define PMIX_NODE_SIZE "pmix.node.size" // (uint32_t) #procs across all jobs on this node +#define PMIX_MAX_PROCS "pmix.max.size" // (uint32_t) max #procs for this job +#define PMIX_NUM_SLOTS "pmix.num.slots" // (uint32_t) #slots allocated +#define PMIX_NUM_NODES "pmix.num.nodes" // (uint32_t) #nodes currently hosting processes in the specified realm. +#define PMIX_NUM_ALLOCATED_NODES "pmix.num.anodes" // (uint32_t) #nodes in the specified realm regardless of whether or + // not they currently host processes. + +/* Memory info */ +#define PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node +#define PMIX_DAEMON_MEMORY "pmix.dmn.mem" // (float) Mbytes of memory currently used by daemon +#define PMIX_CLIENT_AVG_MEMORY "pmix.cl.mem.avg" // (float) Average Mbytes of memory used by client processes + + +/* topology info */ +#define PMIX_TOPOLOGY2 "pmix.topo2" // (pmix_topology_t*) pointer to a PMIx topology object +#define PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location + + +/* request-related info */ +#define PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation +#define PMIX_ALL_CLONES_PARTICIPATE "pmix.clone.part" // (bool) All clones of the calling process must participate in the collective operation. +#define PMIX_COLLECT_GENERATED_JOB_INFO "pmix.collect.gen" // (bool) Collect all job-level information (i.e., reserved keys) that was locally + // generated by PMIx servers. 
Some job-level information (e.g., distance between + // processes and fabric devices) is best determined on a distributed basis as it + // primarily pertains to local processes. Should remote processes need to access + // the information, it can either be obtained collectively using the PMIx_Fence + // operation with this directive, or can be retrieved one peer at a time using + // PMIx_Get without first having performed the job-wide collection. +#define PMIX_TIMEOUT "pmix.timeout" // (int) time in sec before specified operation should time out (0 => infinite) +#define PMIX_IMMEDIATE "pmix.immediate" // (bool) specified operation should immediately return an error from the PMIx + // server if requested data cannot be found - do not request it from + // the host RM +#define PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until at least the specified + // #values are found (0 => all and is the default) +#define PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job +#define PMIX_RANGE "pmix.range" // (pmix_data_range_t) value for calls to publish/lookup/unpublish or for + // monitoring event notifications +#define PMIX_PERSISTENCE "pmix.persist" // (pmix_persistence_t) value for calls to publish +#define PMIX_DATA_SCOPE "pmix.scope" // (pmix_scope_t) scope of the data to be found in a PMIx_Get call +#define PMIX_OPTIONAL "pmix.optional" // (bool) look only in the client's local data store for the requested value - do + // not request data from the server if not found +#define PMIX_GET_STATIC_VALUES "pmix.get.static" // (bool) Request that the data be returned in the provided storage location +#define PMIX_GET_POINTER_VALUES "pmix.get.pntrs" // (bool) Request that any pointers in the returned value point directly + // to values in the key-value store +#define PMIX_EMBED_BARRIER "pmix.embed.barrier" // (bool) execute a blocking fence operation before executing the + // specified operation +#define 
PMIX_JOB_TERM_STATUS "pmix.job.term.status" // (pmix_status_t) status returned upon job termination +#define PMIX_PROC_TERM_STATUS "pmix.proc.term.status" // (pmix_status_t) status returned upon process termination +#define PMIX_PROC_STATE_STATUS "pmix.proc.state" // (pmix_proc_state_t) process state +#define PMIX_GET_REFRESH_CACHE "pmix.get.refresh" // (bool) when retrieving data for a remote process, refresh the existing + // local data cache for the process in case new values have been + // put and committed by it since the last refresh +#define PMIX_ACCESS_PERMISSIONS "pmix.aperms" // (pmix_data_array_t*) Define access permissions for the published + // data. The value shall contain an array of pmix_info_t structs + // containing the specified permissions. +#define PMIX_ACCESS_USERIDS "pmix.auids" // (pmix_data_array_t*) Array of effective UIDs that are allowed to + // access the published data +#define PMIX_ACCESS_GRPIDS "pmix.agids" // (pmix_data_array_t*) Array of effective GIDs that are allowed to + // access the published data +#define PMIX_WAIT_FOR_CONNECTION "pmix.wait.conn" // (bool) wait until the specified connection has been made +#define PMIX_QUALIFIED_VALUE "pmix.qual.val" // (pmix_data_array_t*) Value being provided consists of the primary + // key-value pair in first position, followed by one or more + // key-value qualifiers to be used when subsequently retrieving + // the primary value + + +/* attributes used by host server to pass data to/from the server convenience library - the + * data will then be parsed and provided to the local clients. 
Not generally accessible by users */ +#define PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data +#define PMIX_NODE_MAP "pmix.nmap" // (char*) regex of nodes containing procs for this job +#define PMIX_NODE_MAP_RAW "pmix.nmap.raw" // (char*) comma-delimited list of nodes containing procs for this job +#define PMIX_PROC_MAP "pmix.pmap" // (char*) regex describing procs on each node within this job +#define PMIX_PROC_MAP_RAW "pmix.pmap.raw" // (char*) semi-colon delimited list of strings, each string containing + // a comma-delimited list of ranks on the corresponding node +#define PMIX_ANL_MAP "pmix.anlmap" // (char*) process mapping in ANL notation (used in PMI-1/PMI-2) +#define PMIX_APP_MAP_TYPE "pmix.apmap.type" // (char*) type of mapping used to layout the application (e.g., cyclic) +#define PMIX_APP_MAP_REGEX "pmix.apmap.regex" // (char*) regex describing the result of the mapping +#define PMIX_REQUIRED_KEY "pmix.req.key" // (char*) key the user needs prior to responding from a dmodex request +#define PMIX_LOCAL_COLLECTIVE_STATUS "pmix.loc.col.st" // (pmix_status_t) status code for local collective operation being + // reported to host by server library +#define PMIX_SORTED_PROC_ARRAY "pmix.sorted.parr" // (bool) Proc array being passed has been sorted + + +/* event handler registration and notification info keys */ +#define PMIX_EVENT_HDLR_NAME "pmix.evname" // (char*) string name identifying this handler +#define PMIX_EVENT_HDLR_FIRST "pmix.evfirst" // (bool) invoke this event handler before any other handlers +#define PMIX_EVENT_HDLR_LAST "pmix.evlast" // (bool) invoke this event handler after all other handlers have been called +#define PMIX_EVENT_HDLR_FIRST_IN_CATEGORY "pmix.evfirstcat" // (bool) invoke this event handler before any other handlers in this category +#define PMIX_EVENT_HDLR_LAST_IN_CATEGORY "pmix.evlastcat" // (bool) invoke this event handler after all other handlers in this category have 
been called +#define PMIX_EVENT_HDLR_BEFORE "pmix.evbefore" // (char*) put this event handler immediately before the one specified in the (char*) value +#define PMIX_EVENT_HDLR_AFTER "pmix.evafter" // (char*) put this event handler immediately after the one specified in the (char*) value +#define PMIX_EVENT_HDLR_PREPEND "pmix.evprepend" // (bool) prepend this handler to the precedence list within its category +#define PMIX_EVENT_HDLR_APPEND "pmix.evappend" // (bool) append this handler to the precedence list within its category +#define PMIX_EVENT_CUSTOM_RANGE "pmix.evrange" // (pmix_data_array_t*) array of pmix_proc_t defining range of event notification +#define PMIX_EVENT_AFFECTED_PROC "pmix.evproc" // (pmix_proc_t*) single proc that was affected +#define PMIX_EVENT_AFFECTED_PROCS "pmix.evaffected" // (pmix_data_array_t*) array of pmix_proc_t defining affected procs +#define PMIX_EVENT_NON_DEFAULT "pmix.evnondef" // (bool) event is not to be delivered to default event handlers +#define PMIX_EVENT_RETURN_OBJECT "pmix.evobject" // (void*) object to be returned whenever the registered cbfunc is invoked + // NOTE: the object will _only_ be returned to the process that + // registered it +#define PMIX_EVENT_DO_NOT_CACHE "pmix.evnocache" // (bool) instruct the PMIx server not to cache the event +#define PMIX_EVENT_SILENT_TERMINATION "pmix.evsilentterm" // (bool) do not generate an event when this job normally terminates +#define PMIX_EVENT_PROXY "pmix.evproxy" // (pmix_proc_t*) PMIx server that sourced the event +#define PMIX_EVENT_TEXT_MESSAGE "pmix.evtext" // (char*) text message suitable for output by recipient - e.g., describing + // the cause of the event +#define PMIX_EVENT_TIMESTAMP "pmix.evtstamp" // (time_t) System time when the associated event occurred. 
+#define PMIX_EVENT_ONESHOT "pmix.evone" // (bool) when registering, indicate that this event handler is to be deleted + // after being invoked + +/* fault tolerance-related events */ +#define PMIX_EVENT_TERMINATE_SESSION "pmix.evterm.sess" // (bool) RM intends to terminate session +#define PMIX_EVENT_TERMINATE_JOB "pmix.evterm.job" // (bool) RM intends to terminate this job +#define PMIX_EVENT_TERMINATE_NODE "pmix.evterm.node" // (bool) RM intends to terminate all procs on this node +#define PMIX_EVENT_TERMINATE_PROC "pmix.evterm.proc" // (bool) RM intends to terminate just this process +#define PMIX_EVENT_ACTION_TIMEOUT "pmix.evtimeout" // (int) time in sec before RM will execute error response + +/* attributes used to describe "spawn" directives */ +#define PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use +#define PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs +#define PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs +#define PMIX_ADD_HOST "pmix.addhost" // (char*) comma-delimited list of hosts to add to allocation +#define PMIX_ADD_HOSTFILE "pmix.addhostfile" // (char*) hostfile to add to existing allocation +#define PMIX_PREFIX "pmix.prefix" // (char*) prefix to use for starting spawned procs +#define PMIX_WDIR "pmix.wdir" // (char*) working directory for spawned procs +#define PMIX_WDIR_USER_SPECIFIED "pmix.wdir.user" // (bool) User specified the working directory +#define PMIX_DISPLAY_MAP "pmix.dispmap" // (bool) display placement map upon spawn +#define PMIX_DISPLAY_MAP_DETAILED "pmix.dispmapdet" // (bool) display a highly detailed placement map upon spawn +#define PMIX_DISPLAY_ALLOCATION "pmix.dispalloc" // (bool) display the resource allocation +#define PMIX_DISPLAY_TOPOLOGY "pmix.disptopo" // (char*) comma-delimited list of hosts whose topology is + // to be displayed +#define PMIX_PPR "pmix.ppr" // (char*) #procs to spawn on each identified resource +#define PMIX_MAPBY 
"pmix.mapby" // (char*) mapping policy +#define PMIX_RANKBY "pmix.rankby" // (char*) ranking policy +#define PMIX_BINDTO "pmix.bindto" // (char*) binding policy +#define PMIX_PRELOAD_BIN "pmix.preloadbin" // (bool) preload binaries +#define PMIX_PRELOAD_FILES "pmix.preloadfiles" // (char*) comma-delimited list of files to pre-position +#define PMIX_STDIN_TGT "pmix.stdin" // (pmix_proc_t*) proc that is to receive stdin + // (PMIX_RANK_WILDCARD = all in given nspace) +#define PMIX_DEBUGGER_DAEMONS "pmix.debugger" // (bool) spawned app consists of debugger daemons +#define PMIX_COSPAWN_APP "pmix.cospawn" // (bool) designated app is to be spawned as a disconnected + // job - i.e., not part of the "comm_world" of the job +#define PMIX_COLOCATE_PROCS "pmix.colproc" // (pmix_data_array_t*) Array of pmix_proc_t identifying the procs + // with which the new job's procs are to be colocated +#define PMIX_COLOCATE_NPERPROC "pmix.colnum.proc" // (uint16_t) Number of procs to colocate with each identified proc +#define PMIX_COLOCATE_NPERNODE "pmix.colnum.node" // (uint16_t) Number of procs to colocate on the node of each identified proc +#define PMIX_SET_SESSION_CWD "pmix.ssncwd" // (bool) set the application's current working directory to + // the session working directory assigned by the RM +#define PMIX_INDEX_ARGV "pmix.indxargv" // (bool) mark the argv with the rank of the proc +#define PMIX_CPUS_PER_PROC "pmix.cpuperproc" // (uint32_t) #cpus to assign to each rank +#define PMIX_NO_PROCS_ON_HEAD "pmix.nolocal" // (bool) do not place procs on the head node +#define PMIX_NO_OVERSUBSCRIBE "pmix.noover" // (bool) do not oversubscribe the cpus +#define PMIX_REPORT_BINDINGS "pmix.repbind" // (bool) report bindings of the individual procs +#define PMIX_CPU_LIST "pmix.cpulist" // (char*) list of cpus to use for this job +#define PMIX_JOB_RECOVERABLE "pmix.recover" // (bool) application supports recoverable operations +#define PMIX_JOB_CONTINUOUS "pmix.continuous" // (bool) 
application is continuous, all failed procs should + // be immediately restarted +#define PMIX_MAX_RESTARTS "pmix.maxrestarts" // (uint32_t) max number of times to restart a job +#define PMIX_FWD_STDIN "pmix.fwd.stdin" // (bool) forward the stdin from this process to the target processes +#define PMIX_FWD_STDOUT "pmix.fwd.stdout" // (bool) forward stdout from the spawned processes to this process (typically used by a tool) +#define PMIX_FWD_STDERR "pmix.fwd.stderr" // (bool) forward stderr from the spawned processes to this process (typically used by a tool) +#define PMIX_FWD_STDDIAG "pmix.fwd.stddiag" // (bool) if a diagnostic channel exists, forward any output on it + // from the spawned processes to this process (typically used by a tool) +#define PMIX_SPAWN_TOOL "pmix.spwn.tool" // (bool) job being spawned is a tool +#define PMIX_CMD_LINE "pmix.cmd.line" // (char*) command line executing in the specified nspace +#define PMIX_FORKEXEC_AGENT "pmix.fe.agnt" // (char*) command line of fork/exec agent to be used for starting + // local processes +#define PMIX_JOB_TIMEOUT "pmix.job.time" // (int) time in sec before job should time out (0 => infinite) +#define PMIX_SPAWN_TIMEOUT "pmix.sp.time" // (int) time in sec before spawn operation should time out (0 => infinite) + // Logically equivalent to passing the PMIX_TIMEOUT attribute to the + // PMIx_Spawn API, it is provided as a separate attribute to distinguish + // it from the PMIX_JOB_TIMEOUT attribute +#define PMIX_TIMEOUT_STACKTRACES "pmix.tim.stack" // (bool) include process stacktraces in timeout report from a job +#define PMIX_TIMEOUT_REPORT_STATE "pmix.tim.state" // (bool) report process states in timeout report from a job +#define PMIX_APP_ARGV "pmix.app.argv" // (char*) consolidated argv passed to the spawn command for the given app +#define PMIX_NOTIFY_JOB_EVENTS "pmix.note.jev" // (bool) Requests that the launcher generate the PMIX_EVENT_JOB_START, + // PMIX_LAUNCH_COMPLETE, and PMIX_EVENT_JOB_END events. 
Each event is to + // include at least the namespace of the corresponding job and a + // PMIX_EVENT_TIMESTAMP indicating the time the event occurred. +#define PMIX_NOTIFY_PROC_TERMINATION "pmix.noteproc" // (bool) Requests that the launcher generate the PMIX_EVENT_PROC_TERMINATED + // event whenever a process either normally or abnormally terminates. +#define PMIX_NOTIFY_PROC_ABNORMAL_TERMINATION "pmix.noteabproc" // (bool) Requests that the launcher generate the PMIX_EVENT_PROC_TERMINATED + // event only when a process abnormally terminates. +#define PMIX_ENVARS_HARVESTED "pmix.evar.hvstd" // (bool) Envars have been harvested by the spawn requestor +#define PMIX_RUNTIME_OPTIONS "pmix.runopt" // (char*) Environment-specific runtime directives that control job behavior +#define PMIX_ABORT_NON_ZERO_TERM "pmix.abnz" // (bool) Abort the spawned job if any process terminates with non-zero status +#define PMIX_DO_NOT_LAUNCH "pmix.dnl" // (bool) Execute all procedures to prepare the requested job for launch, + // but do not launch it. Typically combined with the PMIX_DISPLAY_MAP + // or PMIX_DISPLAY_MAP_DETAILED for debugging purposes. +#define PMIX_SHOW_LAUNCH_PROGRESS "pmix.showprog" // (bool) Provide periodic progress reports on job launch procedure (e.g., after + // every 100 processes have been spawned) +#define PMIX_AGGREGATE_HELP "pmix.agg.help" // (bool) Aggregate help messages, reporting each unique help message once + // accompanied by the number of processes that reported it +#define PMIX_REPORT_CHILD_SEP "pmix.rptchildsep" // (bool) Report the exit status of any child jobs spawned by the primary job + // separately. If false, then the final exit status reported will be + // zero if the primary job and all spawned jobs exit normally, or the + // first non-zero status returned by either primary or child jobs. 
+ + +/* query keys - value type shown is the type of the value that will be RETURNED by that key */ +#define PMIX_QUERY_SUPPORTED_KEYS "pmix.qry.keys" // (char*) returns comma-delimited list of keys supported by the query + // function. NO QUALIFIERS +#define PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) returns a comma-delimited list of active namespaces. NO QUALIFIERS +#define PMIX_QUERY_NAMESPACE_INFO "pmix.qry.nsinfo" // (pmix_data_array_t*) returns an array of active nspace information - each + // element will contain an array including the namespace plus the + // command line of the application executing within it + // SUPPORTED QUALIFIERS: PMIX_NSPACE of specific nspace whose info + // is being requested +#define PMIX_QUERY_JOB_STATUS "pmix.qry.jst" // (pmix_status_t) returns status of a specified currently executing job + // REQUIRES a PMIX_NSPACE qualifier indicating the nspace being queried +#define PMIX_QUERY_QUEUE_LIST "pmix.qry.qlst" // (char*) request a comma-delimited list of scheduler queues. NO QUALIFIERS +#define PMIX_QUERY_QUEUE_STATUS "pmix.qry.qst" // (pmix_data_array_t*) returns array where each element contains the name and + // status of a scheduler queue + // SUPPORTED QUALIFIERS: PMIX_ALLOC_QUEUE naming specific queue whose status + // is being requested +#define PMIX_QUERY_PROC_TABLE "pmix.qry.ptable" // (pmix_data_array_t*) returns (pmix_data_array_t*) an array of pmix_proc_info_t + // REQUIRES a PMIX_NSPACE qualifier indicating the nspace being queried +#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (pmix_data_array_t*) returns (pmix_data_array_t*) an array of pmix_proc_info_t + // of pmix_proc_info_t for procs in job on same node + // REQUIRES a PMIX_NSPACE qualifier indicating the nspace being queried +#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // (pmix_data_array_t*) return operations tool is authorized to perform. The contents + // of the array elements have not yet been standardized. 
NO QUALIFIERS +#define PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // (char*) return a comma-delimited list of supported spawn attributes. NO QUALIFIERS +#define PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // (char*) return a comma-delimited list of supported debug attributes. NO QUALIFIERS +#define PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // (pmix_data_array_t*) return info on memory usage for the procs indicated in the qualifiers + // SUPPORTED QUALIFIERS: PMIX_NSPACE/PMIX_RANK, or PMIX_PROCID of specific proc(s) + // whose info is being requested +#define PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) return a string reporting status of an allocation request + // REQUIRES a PMIX_ALLOC_ID qualifier indicating the allocation request being queried +#define PMIX_TIME_REMAINING "pmix.time.remaining" // (uint32_t) returns number of seconds remaining in allocation + // for the specified nspace (defaults to allocation containing the caller) + // SUPPORTED QUALIFIERS: PMIX_NSPACE of the nspace whose info is being requested +#define PMIX_QUERY_NUM_PSETS "pmix.qry.psetnum" // (size_t) returns the number of psets defined + // in the specified range (defaults to session) + // SUPPORTED QUALIFIERS: PMIX_RANGE whose info is being requested +#define PMIX_QUERY_PSET_NAMES "pmix.qry.psets" // (char*) returns a comma-delimited list of the names of the + // psets defined in the specified range (defaults to session) + // SUPPORTED QUALIFIERS: PMIX_RANGE whose info is being requested +#define PMIX_QUERY_PSET_MEMBERSHIP "pmix.qry.pmems" // (pmix_data_array_t*) Return an array of pmix_proc_t containing the members of + // the specified process set. +#define PMIX_QUERY_NUM_GROUPS "pmix.qry.pgrpnum" // (size_t) Return the number of process groups defined in the specified range + // (defaults to session). OPTIONAL QUALIFIERS: PMIX_RANGE. 
+#define PMIX_QUERY_GROUP_NAMES "pmix.qry.pgrp" // (pmix_data_array_t*) Return a pmix_data_array_t containing an array of string + // names of the process groups defined in the specified range (defaults + // to session). OPTIONAL QUALIFIERS: PMIX_RANGE +#define PMIX_QUERY_GROUP_MEMBERSHIP "pmix.qry.pgrpmems" // (pmix_data_array_t*) Return a pmix_data_array_t of pmix_proc_t containing + // the members of the specified process group. REQUIRED QUALIFIERS: + // PMIX_GROUP_ID. +#define PMIX_QUERY_ATTRIBUTE_SUPPORT "pmix.qry.attrs" // (pmix_data_array_t*) returns array of pmix_info_t where each element consists + // of a key containing the name of the function, and an array of pmix_regattr_t + // detailing the attribute support for that function + // SUPPORTED QUALIFIERS: PMIX_CLIENT_FUNCTIONS, PMIX_SERVER_FUNCTIONS, + // PMIX_TOOL_FUNCTIONS, and/or PMIX_HOST_FUNCTIONS +#define PMIX_CLIENT_FUNCTIONS "pmix.client.fns" // (char*) returns a comma-delimited list of supported PMIx client functions. NO QUALIFIERS +#define PMIX_SERVER_FUNCTIONS "pmix.srvr.fns" // (char*) returns a comma-delimited list of supported PMIx server functions. NO QUALIFIERS +#define PMIX_TOOL_FUNCTIONS "pmix.tool.fns" // (char*) returns a comma-delimited list of supported PMIx tool functions. NO QUALIFIERS +#define PMIX_HOST_FUNCTIONS "pmix.host.fns" // (char*) returns a comma-delimited list of PMIx functions supported by the host environment +#define PMIX_QUERY_AVAIL_SERVERS "pmix.qry.asrvrs" // (pmix_data_array_t*) array of pmix_info_t, each element containing an array of + // pmix_info_t of available data for servers on this node + // to which the caller might be able to connect. NO QUALIFIERS +#define PMIX_QUERY_QUALIFIERS "pmix.qry.quals" // (pmix_data_array_t*) Contains an array of qualifiers that were included in the + // query that produced the provided results. 
This attribute is solely for + // reporting purposes and cannot be used in PMIx_Get or other query + // operations +#define PMIX_QUERY_RESULTS "pmix.qry.res" // (pmix_data_array_t*) Contains an array of query results for a given pmix_query_t passed to the + // PMIx_Query_info APIs. If qualifiers were included in the query, then the first element + // of the array shall be the PMIX_QUERY_QUALIFIERS attribute containing those qualifiers. + // Each of the remaining elements of the array is a pmix_info_t containing the query key + // and the corresponding value returned by the query. This attribute is solely for + // reporting purposes and cannot be used in PMIx_Get or other query operations + + +/* query qualifiers - these are used to provide information to narrow/modify the query. Value type shown is the type of data expected + * to be provided with the key */ +#define PMIX_QUERY_REFRESH_CACHE "pmix.qry.rfsh" // (bool) retrieve updated information from server + // to update local cache +#define PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // (bool) constrain the query to local information only +#define PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // (bool) report average values +#define PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // (bool) report minimum and maximum value +#define PMIX_CLIENT_ATTRIBUTES "pmix.client.attrs" // (char*) comma-delimited list of functions, including "all" + // when used in a query, indicates whether or not to include + // attributes supported by the PMIx client library +#define PMIX_SERVER_ATTRIBUTES "pmix.srvr.attrs" // (char*) comma-delimited list of functions, including "all" + // when used in a query, indicates whether or not to include + // attributes supported by the PMIx server library +#define PMIX_HOST_ATTRIBUTES "pmix.host.attrs" // (char*) comma-delimited list of functions, including "all" + // when used in a query, indicates whether or not to include + // attributes supported by the host environment +#define PMIX_TOOL_ATTRIBUTES 
"pmix.tool.attrs" // (char*) comma-delimited list of functions, including "all" + // when used in a query, indicates whether or not to include + // attributes supported by the PMIx tool library +#define PMIX_QUERY_SUPPORTED_QUALIFIERS "pmix.qry.quals" // (bool) return comma-delimited list of qualifiers supported by + // a query on the provided key, instead of actually performing + // the query on the key. + + +/* PMIx_Get information retrieval attributes */ +#define PMIX_SESSION_INFO "pmix.ssn.info" // (bool) Return information about the specified session. If information + // about a session other than the one containing the requesting + // process is desired, then the attribute array must contain a + // PMIX_SESSION_ID attribute identifying the desired target. +#define PMIX_JOB_INFO "pmix.job.info" // (bool) Return information about the specified job or namespace. If + // information about a job or namespace other than the one containing + // the requesting process is desired, then the attribute array must + // contain a PMIX_JOBID or PMIX_NSPACE attribute identifying the + // desired target. Similarly, if information is requested about a + // job or namespace in a session other than the one containing the + // requesting process, then an attribute identifying the target + // session must be provided. +#define PMIX_APP_INFO "pmix.app.info" // (bool) Return information about the specified application. If information + // about an application other than the one containing the requesting + // process is desired, then the attribute array must contain a + // PMIX_APPNUM attribute identifying the desired target. Similarly, + // if information is requested about an application in a job or session + // other than the one containing the requesting process, then attributes + // identifying the target job and/or session must be provided. +#define PMIX_NODE_INFO "pmix.node.info" // (bool) Return information about the specified node. 
If information about a + // node other than the one containing the requesting process is desired, + // then the attribute array must contain either the PMIX_NODEID or + // PMIX_HOSTNAME attribute identifying the desired target. + + +/* information storage attributes */ +#define PMIX_SESSION_INFO_ARRAY "pmix.ssn.arr" // (pmix_data_array_t*) Provide an array of pmix_info_t containing + // session-level information. The PMIX_SESSION_ID attribute is required + // to be included in the array. +#define PMIX_JOB_INFO_ARRAY "pmix.job.arr" // (pmix_data_array_t*) Provide an array of pmix_info_t containing job-level + // information. Information is registered one job (aka namespace) at a time + // via the PMIx_server_register_nspace API. Thus, there is no requirement that + // the array contain either the PMIX_NSPACE or PMIX_JOBID attributes, though + // either or both of them may be included. +#define PMIX_APP_INFO_ARRAY "pmix.app.arr" // (pmix_data_array_t*) Provide an array of pmix_info_t containing app-level + // information. The PMIX_NSPACE or PMIX_JOBID attributes of the job containing + // the application, plus its PMIX_APPNUM attribute, are required to be + // included in the array. +#define PMIX_PROC_INFO_ARRAY "pmix.pdata" // (pmix_data_array_t*) Provide an array of pmix_info_t containing process-realm + // information. The PMIX_RANK and PMIX_NSPACE attributes, or the + // PMIX_PROCID attribute, are required to be included in the array when + // the array is not included as part of a call to + // PMIx_server_register_nspace - i.e., when the job containing the process + // is ambiguous. All three may be included if desired. When the array is + // included in some broader structure that identifies the job, then only + // the PMIX_RANK or the PMIX_PROCID attribute must be included (the others + // are optional). +#define PMIX_NODE_INFO_ARRAY "pmix.node.arr" // (pmix_data_array_t*) Provide an array of pmix_info_t containing node-level + // information. 
At a minimum, either the PMIX_NODEID or PMIX_HOSTNAME + // attribute is required to be included in the array, though both may be + // included. +#define PMIX_SERVER_INFO_ARRAY "pmix.srv.arr" // (pmix_data_array_t*) array of data on a given server, starting with its nspace + + +/* log attributes */ +#define PMIX_LOG_SOURCE "pmix.log.source" // (pmix_proc_t*) ID of source of the log request +#define PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr +#define PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout +#define PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log message to syslog - defaults to ERROR priority. Will log + // to global syslog if available, otherwise to local syslog +#define PMIX_LOG_LOCAL_SYSLOG "pmix.log.lsys" // (char*) log msg to local syslog - defaults to ERROR priority +#define PMIX_LOG_GLOBAL_SYSLOG "pmix.log.gsys" // (char*) forward data to system "master" and log msg to that syslog +#define PMIX_LOG_SYSLOG_PRI "pmix.log.syspri" // (int) syslog priority level + +#define PMIX_LOG_TIMESTAMP "pmix.log.tstmp" // (time_t) timestamp for log report +#define PMIX_LOG_GENERATE_TIMESTAMP "pmix.log.gtstmp" // (bool) generate timestamp for log +#define PMIX_LOG_TAG_OUTPUT "pmix.log.tag" // (bool) label the output stream with the channel name (e.g., "stdout") +#define PMIX_LOG_TIMESTAMP_OUTPUT "pmix.log.tsout" // (bool) print timestamp in output string +#define PMIX_LOG_XML_OUTPUT "pmix.log.xml" // (bool) print the output stream in xml format +#define PMIX_LOG_ONCE "pmix.log.once" // (bool) only log this once with whichever channel can first support it +#define PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere +#define PMIX_LOG_KEY "pmix.log.key" // (char*) key to a logging message +#define PMIX_LOG_VAL "pmix.log.val" // (char*) value to a logging message +#define PMIX_LOG_AGG "pmix.log.agg" // (bool) Whether to aggregate and prevent duplicate logging messages + // based on key value pairs. 
+ +#define PMIX_LOG_EMAIL "pmix.log.email" // (pmix_data_array_t*) log via email based on array of pmix_info_t + // containing directives +#define PMIX_LOG_EMAIL_ADDR "pmix.log.emaddr" // (char*) comma-delimited list of email addresses that are to recv msg +#define PMIX_LOG_EMAIL_SENDER_ADDR "pmix.log.emfaddr" // (char*) return email address of sender +#define PMIX_LOG_EMAIL_SUBJECT "pmix.log.emsub" // (char*) subject line for email +#define PMIX_LOG_EMAIL_MSG "pmix.log.emmsg" // (char*) msg to be included in email +#define PMIX_LOG_EMAIL_SERVER "pmix.log.esrvr" // (char*) hostname (or IP addr) of smtp server +#define PMIX_LOG_EMAIL_SRVR_PORT "pmix.log.esrvrprt" // (int32_t) port the email server is listening to + +#define PMIX_LOG_GLOBAL_DATASTORE "pmix.log.gstore" // (bool) log the provided data to a global datastore +#define PMIX_LOG_JOB_RECORD "pmix.log.jrec" // (bool) log the provided information to the RM's job record +#define PMIX_LOG_PROC_TERMINATION "pmix.logproc" // (bool) Requests that the launcher log the PMIX_EVENT_PROC_TERMINATED event + // whenever a process either normally or abnormally terminates. +#define PMIX_LOG_PROC_ABNORMAL_TERMINATION "pmix.logabproc" // (bool) Requests that the launcher log the PMIX_EVENT_PROC_TERMINATED event + // only when a process abnormally terminates. +#define PMIX_LOG_JOB_EVENTS "pmix.log.jev" // (bool) Requests that the launcher log the PMIX_EVENT_JOB_START, + // PMIX_LAUNCH_COMPLETE, and PMIX_EVENT_JOB_END events using PMIx_Log +#define PMIX_LOG_COMPLETION "pmix.logcomp" // (bool) Requests that the launcher log the PMIX_EVENT_JOB_END event + // for normal or abnormal termination of the spawned job using + // PMIx_Log. The event shall include the returned status code + // (PMIX_JOB_TERM_STATUS) for the corresponding job; the identity + // (PMIX_PROCID) and exit status (PMIX_EXIT_CODE) of the first failed + // process, if applicable; and a PMIX_EVENT_TIMESTAMP indicating the time + // the termination occurred. 
+ + +/* debugger attributes */ +#define PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (varies) stop specified rank(s) on exec and notify ready-to-debug + // Can be any of three data types: + // (a) bool - true indicating all ranks, false indicating none + // (b) pmix_rank_t - the rank of one proc, or WILDCARD for all + // (c) a pmix_data_array_t if an array of individual processes + // are specified +#define PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (varies) stop specified rank(s) in PMIx_Init and notify ready-to-debug + // Can be any of three data types: + // (a) bool - true indicating all ranks, false indicating none + // (b) pmix_rank_t - the rank of one proc, or WILDCARD for all + // (c) a pmix_data_array_t if an array of individual processes + // are specified +#define PMIX_DEBUG_STOP_IN_APP "pmix.dbg.notify" // (varies) direct specified ranks to stop at application-specific point and + // notify ready-to-debug. Can be any of three data types: + // (a) bool - true indicating all ranks, false indicating none + // (b) pmix_rank_t - the rank of one proc, or WILDCARD for all + // (c) a pmix_data_array_t if an array of individual processes + // are specified +#define PMIX_BREAKPOINT "pmix.brkpnt" // (char*) string ID of the breakpoint where the process(es) is(are) waiting +#define PMIX_DEBUG_TARGET "pmix.dbg.tgt" // (pmix_proc_t*) Identifier of proc(s) to be debugged +#define PMIX_DEBUG_DAEMONS_PER_PROC "pmix.dbg.dpproc" // (uint16_t) Number of debugger daemons to be spawned per application + // process. The launcher is to pass the identifier of the namespace to + // be debugged by including the PMIX_DEBUG_TARGET attribute in the + // daemon's job-level information. The debugger daemons spawned on a + // given node are responsible for self-determining their specific + // target process(es) - e.g., by referencing their own PMIX_LOCAL_RANK + // in the daemon debugger job versus the corresponding PMIX_LOCAL_RANK + // of the target processes on the node. 
+#define PMIX_DEBUG_DAEMONS_PER_NODE "pmix.dbg.dpnd" // (uint16_t) Number of debugger daemons to be spawned on each node where the + // target job is executing. The launcher is to pass the identifier of + // the namespace to be debugged by including the PMIX_DEBUG_TARGET + // attribute in the daemon's job-level information. The debugger + // daemons spawned on a given node are responsible for + // self-determining their specific target process(es) - e.g., by + // referencing their own PMIX_LOCAL_RANK in the daemon debugger job + // versus the corresponding PMIX_LOCAL_RANK of the target processes on + // the node. + + +/* Resource Manager identification */ +#define PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager +#define PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string + +/* environmental variable operation attributes */ +#define PMIX_SET_ENVAR "pmix.envar.set" // (pmix_envar_t*) set the envar to the given value, + // overwriting any pre-existing one +#define PMIX_ADD_ENVAR "pmix.envar.add" // (pmix_envar_t*) add envar, but do not overwrite any existing one +#define PMIX_UNSET_ENVAR "pmix.envar.unset" // (char*) unset the envar, if present +#define PMIX_PREPEND_ENVAR "pmix.envar.prepnd" // (pmix_envar_t*) prepend the given value to the + // specified envar using the separator + // character, creating the envar if it doesn't already exist +#define PMIX_APPEND_ENVAR "pmix.envar.appnd" // (pmix_envar_t*) append the given value to the specified + // envar using the separator character, + // creating the envar if it doesn't already exist +#define PMIX_FIRST_ENVAR "pmix.envar.first" // (pmix_envar_t*) ensure the given value appears first in the + // specified envar using the separator + // character, creating the envar if it doesn't already exist + +/* attributes relating to allocations */ +#define PMIX_ALLOC_REQ_ID "pmix.alloc.reqid" // (char*) User-provided string identifier for this allocation request + // which can later be 
used to query status of the request. +#define PMIX_ALLOC_ID "pmix.alloc.id" // (char*) A string identifier (provided by the host environment) for + // the resulting allocation which can later be used to reference + // the allocated resources in, for example, a call to PMIx_Spawn +#define PMIX_ALLOC_NUM_NODES "pmix.alloc.nnodes" // (uint64_t) number of nodes +#define PMIX_ALLOC_NODE_LIST "pmix.alloc.nlist" // (char*) regex of specific nodes +#define PMIX_ALLOC_NUM_CPUS "pmix.alloc.ncpus" // (uint64_t) number of cpus +#define PMIX_ALLOC_NUM_CPU_LIST "pmix.alloc.ncpulist" // (char*) regex of #cpus for each node +#define PMIX_ALLOC_CPU_LIST "pmix.alloc.cpulist" // (char*) regex of specific cpus indicating the cpus involved. +#define PMIX_ALLOC_MEM_SIZE "pmix.alloc.msize" // (float) number of Mbytes +#define PMIX_ALLOC_FABRIC "pmix.alloc.net" // (pmix_data_array_t*) Array of pmix_info_t describing + // fabric resource request. This must include at least: + // * PMIX_ALLOC_FABRIC_ID + // * PMIX_ALLOC_FABRIC_TYPE + // * PMIX_ALLOC_FABRIC_ENDPTS + // plus whatever other descriptors are desired +#define PMIX_ALLOC_FABRIC_ID "pmix.alloc.netid" // (char*) key to be used when accessing this requested fabric allocation. The + // allocation will be returned/stored as a pmix_data_array_t of + // pmix_info_t indexed by this key and containing at least one + // entry with the same key and the allocated resource description. + // The type of the included value depends upon the fabric + // support. For example, a TCP allocation might consist of a + // comma-delimited string of socket ranges such as + // "32000-32100,33005,38123-38146". Additional entries will consist + // of any provided resource request directives, along with their + // assigned values. 
Examples include: + // * PMIX_ALLOC_FABRIC_TYPE - the type of resources provided + // * PMIX_ALLOC_FABRIC_PLANE - if applicable, what plane the + // resources were assigned from + // * PMIX_ALLOC_FABRIC_QOS - the assigned QoS + // * PMIX_ALLOC_BANDWIDTH - the allocated bandwidth + // * PMIX_ALLOC_FABRIC_SEC_KEY - a security key for the requested + // fabric allocation + // NOTE: the assigned values may differ from those requested, + // especially if the "required" flag was not set in the request +#define PMIX_ALLOC_BANDWIDTH "pmix.alloc.bw" // (float) Mbits/sec +#define PMIX_ALLOC_FABRIC_QOS "pmix.alloc.netqos" // (char*) quality of service level +#define PMIX_ALLOC_TIME "pmix.alloc.time" // (uint32_t) time in seconds that the allocation shall remain valid +#define PMIX_ALLOC_FABRIC_TYPE "pmix.alloc.nettype" // (char*) type of desired transport (e.g., tcp, udp) +#define PMIX_ALLOC_FABRIC_PLANE "pmix.alloc.netplane" // (char*) id string for the NIC (aka plane) to be used for this allocation + // (e.g., CIDR for Ethernet) +#define PMIX_ALLOC_FABRIC_ENDPTS "pmix.alloc.endpts" // (size_t) number of endpoints to allocate per process +#define PMIX_ALLOC_FABRIC_ENDPTS_NODE "pmix.alloc.endpts.nd" // (size_t) number of endpoints to allocate per node +#define PMIX_ALLOC_FABRIC_SEC_KEY "pmix.alloc.nsec" // (pmix_byte_object_t) fabric security key +#define PMIX_ALLOC_QUEUE "pmix.alloc.queue" // (char*) name of queue being referenced + + +/* job control attributes */ +#define PMIX_JOB_CTRL_ID "pmix.jctrl.id" // (char*) provide a string identifier for this request +#define PMIX_JOB_CTRL_PAUSE "pmix.jctrl.pause" // (bool) pause the specified processes +#define PMIX_JOB_CTRL_RESUME "pmix.jctrl.resume" // (bool) "un-pause" the specified processes +#define PMIX_JOB_CTRL_CANCEL "pmix.jctrl.cancel" // (char*) cancel the specified request + // (NULL => cancel all requests from this requestor) +#define PMIX_JOB_CTRL_KILL "pmix.jctrl.kill" // (bool) forcibly terminate the specified 
processes and cleanup +#define PMIX_JOB_CTRL_RESTART "pmix.jctrl.restart" // (char*) restart the specified processes using the given checkpoint ID +#define PMIX_JOB_CTRL_CHECKPOINT "pmix.jctrl.ckpt" // (char*) checkpoint the specified processes and assign the given ID to it +#define PMIX_JOB_CTRL_CHECKPOINT_EVENT "pmix.jctrl.ckptev" // (bool) use event notification to trigger process checkpoint +#define PMIX_JOB_CTRL_CHECKPOINT_SIGNAL "pmix.jctrl.ckptsig" // (int) use the given signal to trigger process checkpoint +#define PMIX_JOB_CTRL_CHECKPOINT_TIMEOUT "pmix.jctrl.ckptsig" // (int) time in seconds to wait for checkpoint to complete +#define PMIX_JOB_CTRL_CHECKPOINT_METHOD "pmix.jctrl.ckmethod" // (pmix_data_array_t*) array of pmix_info_t declaring each + // method and value supported by this application +#define PMIX_JOB_CTRL_SIGNAL "pmix.jctrl.sig" // (int) send given signal to specified processes +#define PMIX_JOB_CTRL_PROVISION "pmix.jctrl.pvn" // (char*) regex identifying nodes that are to be provisioned +#define PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned +#define PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted +#define PMIX_JOB_CTRL_TERMINATE "pmix.jctrl.term" // (bool) politely terminate the specified procs +#define PMIX_REGISTER_CLEANUP "pmix.reg.cleanup" // (char*) comma-delimited list of files to + // be removed upon process termination +#define PMIX_REGISTER_CLEANUP_DIR "pmix.reg.cleanupdir" // (char*) comma-delimited list of directories to + // be removed upon process termination +#define PMIX_CLEANUP_RECURSIVE "pmix.clnup.recurse" // (bool) recursively cleanup all subdirectories under the + // specified one(s) +#define PMIX_CLEANUP_EMPTY "pmix.clnup.empty" // (bool) only remove empty subdirectories +#define PMIX_CLEANUP_IGNORE "pmix.clnup.ignore" // (char*) comma-delimited list of filenames that are not + // to be removed +#define PMIX_CLEANUP_LEAVE_TOPDIR 
"pmix.clnup.lvtop" // (bool) when recursively cleaning subdirs, do not remove + // the top-level directory (the one given in the + // cleanup request) + +/* monitoring attributes */ +#define PMIX_MONITOR_ID "pmix.monitor.id" // (char*) provide a string identifier for this request +#define PMIX_MONITOR_CANCEL "pmix.monitor.cancel" // (char*) identifier to be canceled (NULL = cancel all + // monitoring for this process) +#define PMIX_MONITOR_APP_CONTROL "pmix.monitor.appctrl" // (bool) the application desires to control the response to + // a monitoring event +#define PMIX_MONITOR_HEARTBEAT "pmix.monitor.mbeat" // (bool) register to have the server monitor the requestor for heartbeats +#define PMIX_SEND_HEARTBEAT "pmix.monitor.beat" // (bool) send heartbeat to local server +#define PMIX_MONITOR_HEARTBEAT_TIME "pmix.monitor.btime" // (uint32_t) time in seconds before declaring heartbeat missed +#define PMIX_MONITOR_HEARTBEAT_DROPS "pmix.monitor.bdrop" // (uint32_t) number of heartbeats that can be missed before + // generating the event +#define PMIX_MONITOR_FILE "pmix.monitor.fmon" // (char*) register to monitor file for signs of life +#define PMIX_MONITOR_FILE_SIZE "pmix.monitor.fsize" // (bool) monitor size of given file is growing to determine app is running +#define PMIX_MONITOR_FILE_ACCESS "pmix.monitor.faccess" // (char*) monitor time since last access of given file to determine app is running +#define PMIX_MONITOR_FILE_MODIFY "pmix.monitor.fmod" // (char*) monitor time since last modified of given file to determine app is running +#define PMIX_MONITOR_FILE_CHECK_TIME "pmix.monitor.ftime" // (uint32_t) time in seconds between checking file +#define PMIX_MONITOR_FILE_DROPS "pmix.monitor.fdrop" // (uint32_t) number of file checks that can be missed before + // generating the event + +/* security attributes */ +#define PMIX_CRED_TYPE "pmix.sec.ctype" // (char*) when passed in PMIx_Get_credential, a prioritized, + // comma-delimited list of desired credential types 
for use + // in environments where multiple authentication mechanisms + // may be available. When returned in a callback function, a + // string identifier of the credential type +#define PMIX_CRYPTO_KEY "pmix.sec.key" // (pmix_byte_object_t) blob containing crypto key + + +/* IO Forwarding Attributes */ +#define PMIX_IOF_CACHE_SIZE "pmix.iof.csize" // (uint32_t) requested size of the server cache in bytes for each specified channel. + // By default, the server is allowed (but not required) to drop + // all bytes received beyond the max size +#define PMIX_IOF_DROP_OLDEST "pmix.iof.old" // (bool) in an overflow situation, drop the oldest bytes to make room in the cache +#define PMIX_IOF_DROP_NEWEST "pmix.iof.new" // (bool) in an overflow situation, drop any new bytes received until room becomes + // available in the cache (default) +#define PMIX_IOF_BUFFERING_SIZE "pmix.iof.bsize" // (uint32_t) basically controls grouping of IO on the specified channel(s) to + // avoid being called every time a bit of IO arrives. The library + // will execute the callback whenever the specified number of bytes + // becomes available. Any remaining buffered data will be "flushed" + // upon call to deregister the respective channel +#define PMIX_IOF_BUFFERING_TIME "pmix.iof.btime" // (uint32_t) max time in seconds to buffer IO before delivering it. 
Used in conjunction + // with buffering size, this prevents IO from being held indefinitely + // while waiting for another payload to arrive +#define PMIX_IOF_COMPLETE "pmix.iof.cmp" // (bool) indicates whether or not the specified IO channel has been closed + // by the source +#define PMIX_IOF_PUSH_STDIN "pmix.iof.stdin" // (bool) Used by a tool to request that the PMIx library collect + // the tool's stdin and forward it to the procs specified in + // the PMIx_IOF_push call +#define PMIX_IOF_TAG_OUTPUT "pmix.iof.tag" // (bool) Tag output with the [local jobid,rank] and channel it comes from +#define PMIX_IOF_TAG_DETAILED_OUTPUT "pmix.iof.tagdet" // (bool) Tag output with the [local jobid,rank][hostname:pid] and channel it comes from +#define PMIX_IOF_TAG_FULLNAME_OUTPUT "pmix.iof.tagfull" // (bool) Tag output with the [nspace,rank] and channel it comes from +#define PMIX_IOF_RANK_OUTPUT "pmix.iof.rank" // (bool) Tag output with the rank it came from +#define PMIX_IOF_TIMESTAMP_OUTPUT "pmix.iof.ts" // (bool) Timestamp output +#define PMIX_IOF_MERGE_STDERR_STDOUT "pmix.iof.mrg" // (bool) merge stdout and stderr streams from application procs +#define PMIX_IOF_XML_OUTPUT "pmix.iof.xml" // (bool) Format output in XML +#define PMIX_IOF_OUTPUT_TO_FILE "pmix.iof.file" // (char*) direct application output into files of form + // ".rank" with both stdout and stderr redirected into it +#define PMIX_IOF_FILE_PATTERN "pmix.iof.fpt" // (bool) Specified output file is to be treated as a pattern and not + // automatically annotated by nspace, rank, or other parameters +#define PMIX_IOF_OUTPUT_TO_DIRECTORY "pmix.iof.dir" // (char*) direct application output into files of form + // "//rank./stdout[err]" +#define PMIX_IOF_FILE_ONLY "pmix.iof.fonly" // (bool) output only into designated files - do not also output + // a copy to stdout/stderr +#define PMIX_IOF_COPY "pmix.iof.cpy" // (bool) Requests that the host environment deliver a copy of the + // specified output stream(s) to 
the tool, letting the stream(s) + // continue to also be delivered to the default location. This + // allows the tool to tap into the output stream(s) without + // redirecting it from its current final destination. +#define PMIX_IOF_REDIRECT "pmix.iof.redir" // (bool) Requests that the host environment intercept the specified + // output stream(s) and deliver it to the requesting tool instead + // of its current final destination. This might be used, for + // example, during a debugging procedure to avoid injection of + // debugger-related output into the application's results file. + // The original output stream(s) destination is restored upon + // termination of the tool. +#define PMIX_IOF_LOCAL_OUTPUT "pmix.iof.local" // (bool) Write output streams to local stdout/err +#define PMIX_IOF_OUTPUT_RAW "pmix.iof.raw" // (bool) Do not buffer output to be written as complete lines - output + // characters as the stream delivers them + +/* Attributes for controlling contents of application setup data */ +#define PMIX_SETUP_APP_ENVARS "pmix.setup.env" // (bool) harvest and include relevant envars +#define PMIX_SETUP_APP_NONENVARS "pmix.setup.nenv" // (bool) include all non-envar data +#define PMIX_SETUP_APP_ALL "pmix.setup.all" // (bool) include all relevant data + +/* Attributes supporting the PMIx Groups APIs */ +#define PMIX_GROUP_ID "pmix.grp.id" // (char*) user-provided group identifier +#define PMIX_GROUP_LEADER "pmix.grp.ldr" // (bool) this process is the leader of the group +#define PMIX_GROUP_OPTIONAL "pmix.grp.opt" // (bool) participation is optional - do not return an error if any of the + // specified processes terminate without having joined. The default + // is false +#define PMIX_GROUP_NOTIFY_TERMINATION "pmix.grp.notterm" // (bool) notify remaining members when another member terminates without first + // leaving the group. The default is false +#define PMIX_GROUP_FT_COLLECTIVE "pmix.grp.ftcoll" // (bool) adjust internal tracking for terminated processes. 
Default is false +#define PMIX_GROUP_MEMBERSHIP "pmix.grp.mbrs" // (pmix_data_array_t*) array of group member ID's +#define PMIX_GROUP_ASSIGN_CONTEXT_ID "pmix.grp.actxid" // (bool) request that the RM assign a unique numerical (size_t) ID to this group +#define PMIX_GROUP_CONTEXT_ID "pmix.grp.ctxid" // (size_t) context ID assigned to group +#define PMIX_GROUP_LOCAL_ONLY "pmix.grp.lcl" // (bool) group operation only involves local procs +#define PMIX_GROUP_ENDPT_DATA "pmix.grp.endpt" // (pmix_byte_object_t) data collected to be shared during construction +#define PMIX_GROUP_NAMES "pmix.pgrp.nm" // (pmix_data_array_t*) Returns an array of string names of the process groups + // in which the given process is a member. +#define PMIX_GROUP_INFO "pmix.grp.info" // (pmix_data_array_t*) Array of pmix_info_t containing data that is to be + // shared across all members of a group during group construction +#define PMIX_GROUP_LOCAL_CID "pmix.grp.lclid" // (size_t) local context ID for the specified process member of a group +#define PMIX_GROUP_ADD_MEMBERS "pmix.grp.add" // (pmix_data_array_t*) Array of pmix_proc_t identifying procs that are not + // included in the membership specified in the procs array passed to + // the PMIx_Group_construct[_nb] call, but are to be included in the + // final group. The identified procs will be sent an invitation to + // join the group during the construction procedure. This is used when + // some members of the proposed group do not know the full membership + // and therefore cannot include all members in the call to construct. 
+ +/* Storage-Related Attributes */ +#define PMIX_QUERY_STORAGE_LIST "pmix.strg.list" // (char*) return comma-delimited list of identifiers for all available storage systems +#define PMIX_STORAGE_CAPACITY_LIMIT "pmix.strg.cap" // (uint64_t) return overall capacity (in Megabytes[base2]) of specified storage system +#define PMIX_STORAGE_OBJECT_LIMIT "pmix.strg.obj" // (uint64_t) return overall limit on number of objects (e.g., inodes) of specified storage system + +#define PMIX_STORAGE_ID "pmix.strg.id" // (char*) identifier of the storage system being referenced +#define PMIX_STORAGE_PATH "pmix.strg.path" // (char*) Mount point corresponding to a specified storage ID +#define PMIX_STORAGE_TYPE "pmix.strg.type" // (char*) Qualifier indicating the type of storage being referenced by a query + // (e.g., lustre, gpfs, online, fabric-attached, ...) +#define PMIX_STORAGE_ACCESSIBILITY "pmix.strg.access" // (pmix_storage_accessibility_t) Accessibility level of the storage system + // (e.g., within same node, within same session) +#define PMIX_STORAGE_ACCESS_TYPE "pmix.strg.atype" // (pmix_storage_access_type_t) Qualifier describing the type of storage access to return + // information for (e.g., for qualifying PMIX_STORAGE_BW_CUR, PMIX_STORAGE_IOPS_CUR, + // or PMIX_STORAGE_SUGGESTED_XFER_SIZE attributes) +#define PMIX_STORAGE_BW_CUR "pmix.strg.bwcur" // (double) Observed bandwidth (in bytes/sec) for storage system - provided as a + // recently observed bandwidth value, with the exact measurement interval + // depending on the storage system and/or PMIx library implementation +#define PMIX_STORAGE_BW_MAX "pmix.strg.bwmax" // (double) Maximum bandwidth (in bytes/sec) for storage system - provided as the + // theoretical maximum or the maximum observed bandwidth value +#define PMIX_STORAGE_CAPACITY_USED "pmix.strg.capuse" // (double) Overall used capacity (in bytes) for the storage system +#define PMIX_STORAGE_IOPS_CUR "pmix.strg.iopscur" // (double) Observed IOPS (in I/O 
operations per second) for storage system - provided + // as a recently observed IOPS value, with the exact measurement interval depending + // on the storage system and/or PMIx library implementation +#define PMIX_STORAGE_IOPS_MAX "pmix.strg.iopsmax" // (double) Maximum IOPS (in I/O operations per second) for storage system - provided + // as the theoretical maximum or the maximum observed IOPS value +#define PMIX_STORAGE_MEDIUM "pmix.strg.medium" // (pmix_storage_medium_t) Types of storage mediums utilized by the storage system + // (e.g., SSDs, HDDs, tape) +#define PMIX_STORAGE_MINIMAL_XFER_SIZE "pmix.strg.minxfer" // (double) Minimal transfer size (in bytes) for the storage system - this is the + // storage system's atomic unit of transfer (e.g., block size) +#define PMIX_STORAGE_OBJECTS_USED "pmix.strg.objuse" // (uint64_t) Overall used number of objects (e.g., inodes) for the storage system +#define PMIX_STORAGE_PERSISTENCE "pmix.strg.persist" // (pmix_storage_persistence_t) Persistence level of the storage system + // (e.g., scratch storage or archive storage) +#define PMIX_STORAGE_SUGGESTED_XFER_SIZE "pmix.strg.sxfer" // (double) Suggested transfer size (in bytes) for the storage system +#define PMIX_STORAGE_VERSION "pmix.strg.ver" // (char*) Version string for the storage system + + +/* Fabric-related Attributes */ +#define PMIX_FABRIC_COST_MATRIX "pmix.fab.cm" // (pointer) Pointer to a two-dimensional array of point-to-point relative + // communication costs expressed as uint16_t values +#define PMIX_FABRIC_GROUPS "pmix.fab.grps" // (char*) A string delineating the group membership of nodes in the system, + // where each fabric group consists of the group number followed by + // a colon and a comma-delimited list of nodes in that group, with the + // groups delimited by semi-colons (e.g., + // 0:node000,node002,node004,node006;1:node001,node003,node005,node007) +#define PMIX_FABRIC_VENDOR "pmix.fab.vndr" // (char*) Name of fabric vendor (e.g., Amazon, 
Mellanox, HPE, Intel) +#define PMIX_FABRIC_IDENTIFIER "pmix.fab.id" // (char*) An identifier for the fabric (e.g., MgmtEthernet, Slingshot-11, + // OmniPath-1) +#define PMIX_FABRIC_INDEX "pmix.fab.idx" // (size_t) The index of the fabric as returned in pmix_fabric_t +#define PMIX_FABRIC_COORDINATES "pmix.fab.coord" // (pmix_data_array_t*) Array of pmix_geometry_t fabric coordinates for + // devices on the specified node. The array will contain the + // coordinates of all devices on the node, including values for + // all supported coordinate views. The information for devices + // on the local node shall be provided if the node is not + // specified in the request. +#define PMIX_FABRIC_DEVICE_VENDORID "pmix.fabdev.vendid" // (char*) This is a vendor-provided identifier for the device or product. +#define PMIX_FABRIC_NUM_DEVICES "pmix.fab.nverts" // (size_t) Total number of fabric devices in the system - corresponds to + // the number of rows or columns in the cost matrix +#define PMIX_FABRIC_DIMS "pmix.fab.dims" // (uint32_t) Number of dimensions in the specified fabric plane/view. If no + // plane is specified in a request, then the dimensions of all planes + // in the overall system will be returned as a pmix_data_array_t + // containing an array of uint32_t values. Default is to + // provide dimensions in logical view. + +#define PMIX_FABRIC_PLANE "pmix.fab.plane" // (char*) ID string of a fabric plane (e.g., CIDR for Ethernet). When used as + // a modifier in a request for information, specifies the plane whose + // information is to be returned. When used directly as a key in a + // request, returns a pmix_data_array_t of string + // identifiers for all fabric planes in the overall system. + +#define PMIX_FABRIC_SWITCH "pmix.fab.switch" // (char*) ID string of a fabric switch. When used as a modifier in a request + // for information, specifies the switch whose information is to be + // returned. 
When used directly as a key in a request, returns a + // pmix_data_array_t of string identifiers for all fabric switches in + // the overall system. + +#define PMIX_FABRIC_ENDPT "pmix.fab.endpt" // (pmix_data_array_t*) Fabric endpoints for a specified process. As multiple + // endpoints may be assigned to a given process (e.g., in the case + // where multiple devices are associated with a package to which the + // process is bound), the returned values will be provided in a + // pmix_data_array_t of pmix_endpoint_t elements. + +#define PMIX_FABRIC_SHAPE "pmix.fab.shape" // (pmix_data_array_t*) The size of each dimension in the specified fabric + // plane/view, returned in a pmix_data_array_t containing an array of + // uint32_t values. The size is defined as the number of elements + // present in that dimension - e.g., the number of devices in one + // dimension of a physical view of a fabric plane. If no plane is + // specified, then the shape of each plane in the overall system will + // be returned in a pmix_data_array_t array where each element is + // itself a two-element array containing the PMIX_FABRIC_PLANE + // followed by that plane's fabric shape. Default is to provide the + // shape in logical view. + +#define PMIX_FABRIC_SHAPE_STRING "pmix.fab.shapestr" // (char*) Network shape expressed as a string (e.g., "10x12x2"). If no plane + // is specified, then the shape of each plane in the overall system + // will be returned in a pmix_data_array_t array where + // each element is itself a two-element array containing the + // PMIX_FABRIC_PLANE followed by that plane's fabric shape string. + // Default is to provide the shape in logical view. + +#define PMIX_SWITCH_PEERS "pmix.speers" // (char*) Peer ranks that share the same switch as the process specified in + // the call to PMIx_Get. 
Returns a pmix_data_array_t array of + // pmix_info_t results, each element containing the PMIX_SWITCH_PEERS + // key with a three-element pmix_data_array_t array of pmix_info_t + // containing the PMIX_FABRIC_DEVICE_ID of the local fabric device, + // the PMIX_FABRIC_SWITCH identifying the switch to which it is + // connected, and a comma-delimited string of peer ranks sharing the + // switch to which that device is connected. +#define PMIX_FABRIC_DEVICE "pmix.fabdev" // (pmix_data_array_t*) An array of pmix_info_t describing a particular + // fabric device. The first element in the array shall be the + // PMIX_FABRIC_DEVICE_ID of the device +#define PMIX_FABRIC_DEVICES "pmix.fab.devs" // (pmix_data_array_t*) Array of pmix_info_t containing information for all + // devices on the specified node. Each element of the array will contain + // a PMIX_FABRIC_DEVICE entry, which in turn will contain an array of + // information on a given device. +#define PMIX_FABRIC_DEVICE_NAME "pmix.fabdev.nm" // (char*) The operating system name associated with the device. This may be + // a logical fabric interface name (e.g. eth0 or eno1) or an absolute + // filename. +#define PMIX_FABRIC_DEVICE_INDEX "pmix.fabdev.idx" // (uint32_t) Index of the device within an associated communication cost + // matrix. + +#define PMIX_FABRIC_DEVICE_VENDOR "pmix.fabdev.vndr" // (char*) Indicates the name of the vendor that distributes the NIC. +#define PMIX_FABRIC_DEVICE_BUS_TYPE "pmix.fabdev.btyp" // (char*) The type of bus to which the device is attached (e.g., "PCI", + // "GEN-Z"). +#define PMIX_FABRIC_DEVICE_DRIVER "pmix.fabdev.driver" // (char*) The name of the driver associated with the device +#define PMIX_FABRIC_DEVICE_FIRMWARE "pmix.fabdev.fmwr" // (char*) The device's firmware version +#define PMIX_FABRIC_DEVICE_ADDRESS "pmix.fabdev.addr" // (char*) The primary link-level address associated with the device, such as a + // MAC address. 
If multiple addresses are available, only one will be + // reported. +#define PMIX_FABRIC_DEVICE_COORDINATES "pmix.fab.coord" // (pmix_geometry_t) The pmix_geometry_t fabric coordinates for the device, including + // values for all supported coordinate views. +#define PMIX_FABRIC_DEVICE_MTU "pmix.fabdev.mtu" // (size_t) The maximum transfer unit of link level frames or packets, + // in bytes. +#define PMIX_FABRIC_DEVICE_SPEED "pmix.fabdev.speed" // (size_t) The active link data rate, given in bits per second. +#define PMIX_FABRIC_DEVICE_STATE "pmix.fabdev.state" // (pmix_link_state_t) The last available physical port state. Possible values + // are PMIX_LINK_STATE_UNKNOWN, PMIX_LINK_DOWN, and PMIX_LINK_UP, to + // indicate if the port state is unknown or not applicable (unknown), + // inactive (down), or active (up). +#define PMIX_FABRIC_DEVICE_TYPE "pmix.fabdev.type" // (char*) Specifies the type of fabric interface currently active on the + // device, such as Ethernet or InfiniBand. +#define PMIX_FABRIC_DEVICE_PCI_DEVID "pmix.fabdev.pcidevid" // (char*) A node-level unique identifier for a PCI device. Provided only if the + // device is located on a PCI bus. The identifier is constructed as + // a four-part tuple delimited by colons comprised of the PCI 16-bit + // domain, 8-bit bus, 8-bit device, and 8-bit function IDs, each expressed + // in zero-extended hexadecimal form. Thus, an example identifier might be + // "abc1:0f:23:01". The combination of node identifier PMIX_HOSTNAME or + // PMIX_NODEID and PMIX_FABRIC_DEVICE_PCI_DEVID shall be unique within the + // system. + + +/* Distance Attributes */ +#define PMIX_DEVICE_DISTANCES "pmix.dev.dist" // (pmix_data_array_t*) Return an array of pmix_device_distance_t containing the + // minimum and maximum distances of the given process location to all + // devices of the specified type on the local node. 
+#define PMIX_DEVICE_TYPE "pmix.dev.type" // (pmix_device_type_t) Bitmask specifying the type(s) of device(s) whose + // information is being requested. Only used as a directive/qualifier. +#define PMIX_DEVICE_ID "pmix.dev.id" // (char*) System-wide UUID or node-local OS name of a particular device. + + +/* Descriptive Attributes */ +#define PMIX_MAX_VALUE "pmix.descr.maxval" // (varies) Used in pmix_regattr_t to describe the maximum valid value + // for the associated attribute. +#define PMIX_MIN_VALUE "pmix.descr.minval" // (varies) Used in pmix_regattr_t to describe the minimum valid value + // for the associated attribute. +#define PMIX_ENUM_VALUE "pmix.descr.enum" // (char*) Used in pmix_regattr_t to describe accepted values for the + // associated attribute. Numerical values shall be presented in + // a form convertible to the attribute's declared data type. + // Named values (i.e., values defined by constant names via a + // typical C-language enum declaration) must be provided as + // their numerical equivalent. +#define PMIX_QUERY_STABLE_ABI_VERSION "pmix.qry.stabiver" // (char*) The PMIx Standard Stable ABI version supported returned in the form of a comma separated list of "MAJOR.MINOR" + // This attribute can be used with PMIx_Query_info outside of the init/finalize region. +#define PMIX_QUERY_PROVISIONAL_ABI_VERSION "pmix.qry.prabiver" // (char*) The PMIx Standard Provisional ABI version supported returned in the form of a comma separated "MAJOR.MINOR" + // This attribute can be used with PMIx_Query_info outside of the init/finalize region. 
+ +/**** PROCESS STATE DEFINITIONS ****/ +typedef uint8_t pmix_proc_state_t; +#define PMIX_PROC_STATE_UNDEF 0 /* undefined process state */ +#define PMIX_PROC_STATE_PREPPED 1 /* process is ready to be launched */ +#define PMIX_PROC_STATE_LAUNCH_UNDERWAY 2 /* launch process underway */ +#define PMIX_PROC_STATE_RESTART 3 /* the proc is ready for restart */ +#define PMIX_PROC_STATE_TERMINATE 4 /* process is marked for termination */ +#define PMIX_PROC_STATE_RUNNING 5 /* daemon has locally fork'd process */ +#define PMIX_PROC_STATE_CONNECTED 6 /* proc connected to PMIx server */ +/* +* Define a "boundary" so users can easily and quickly determine +* if a proc is still running or not - any value less than +* this one means that the proc has not terminated +*/ +#define PMIX_PROC_STATE_UNTERMINATED 15 + +#define PMIX_PROC_STATE_TERMINATED 20 /* process has terminated and is no longer running */ +/* Define a boundary so users can easily and quickly determine +* if a proc abnormally terminated - leave a little room +* for future expansion +*/ +#define PMIX_PROC_STATE_ERROR 50 +/* Define specific error code values */ +#define PMIX_PROC_STATE_KILLED_BY_CMD (PMIX_PROC_STATE_ERROR + 1) /* process was killed by cmd */ +#define PMIX_PROC_STATE_ABORTED (PMIX_PROC_STATE_ERROR + 2) /* process aborted */ +#define PMIX_PROC_STATE_FAILED_TO_START (PMIX_PROC_STATE_ERROR + 3) /* process failed to start */ +#define PMIX_PROC_STATE_ABORTED_BY_SIG (PMIX_PROC_STATE_ERROR + 4) /* process aborted by signal */ +#define PMIX_PROC_STATE_TERM_WO_SYNC (PMIX_PROC_STATE_ERROR + 5) /* process exit'd w/o calling PMIx_Finalize */ +#define PMIX_PROC_STATE_COMM_FAILED (PMIX_PROC_STATE_ERROR + 6) /* process communication has failed */ +#define PMIX_PROC_STATE_SENSOR_BOUND_EXCEEDED (PMIX_PROC_STATE_ERROR + 7) /* process exceeded a sensor limit */ +#define PMIX_PROC_STATE_CALLED_ABORT (PMIX_PROC_STATE_ERROR + 8) /* process called "PMIx_Abort" */ +#define PMIX_PROC_STATE_HEARTBEAT_FAILED 
(PMIX_PROC_STATE_ERROR + 9) /* process failed to send heartbeat w/in time limit */ +#define PMIX_PROC_STATE_MIGRATING (PMIX_PROC_STATE_ERROR + 10) /* process failed and is waiting for resources before restarting */ +#define PMIX_PROC_STATE_CANNOT_RESTART (PMIX_PROC_STATE_ERROR + 11) /* process failed and cannot be restarted */ +#define PMIX_PROC_STATE_TERM_NON_ZERO (PMIX_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */ +#define PMIX_PROC_STATE_FAILED_TO_LAUNCH (PMIX_PROC_STATE_ERROR + 13) /* unable to launch process */ + + +/**** JOB STATE DEFINITIONS ****/ +typedef uint8_t pmix_job_state_t; +#define PMIX_JOB_STATE_UNDEF 0 // undefined process state +#define PMIX_JOB_STATE_AWAITING_ALLOC 1 // Job is waiting for resources to be allocated to it +#define PMIX_JOB_STATE_LAUNCH_UNDERWAY 2 // job launch underway +#define PMIX_JOB_STATE_RUNNING 3 // all procs have been spawned +#define PMIX_JOB_STATE_SUSPENDED 4 // job has been suspended +#define PMIX_JOB_STATE_CONNECTED 5 // all procs have connected to their PMIx server + +/* +* Define a "boundary" so users can easily and quickly determine +* if a job is still running or not - any value less than +* this one means that the job has not terminated +*/ +#define PMIX_JOB_STATE_UNTERMINATED 15 + +#define PMIX_JOB_STATE_TERMINATED 20 // job has terminated and is no longer running - typically will + // be accompanied by the job exit status in response to a query + +/* Define a boundary so users can easily and quickly determine +* if a job abnormally terminated - leave a little room +* for future expansion +*/ +#define PMIX_JOB_STATE_TERMINATED_WITH_ERROR 50 // job has terminated and is no longer running - typically will + // be accompanied by a job-related error code in response to a query + + +/**** PMIX ERROR CONSTANTS ****/ +/* PMIx errors are always negative, with 0 reserved for success */ +typedef int pmix_status_t; + +/* v1.x error values - must be fixed in place for backward + * 
compatibility. Note that some number of these have been + * deprecated and may not be returned by v2.x and above + * clients or servers. However, they must always be + * at least defined to ensure older codes will compile */ +#define PMIX_SUCCESS 0 +#define PMIX_ERROR -1 // general error +/* fault tolerance */ +#define PMIX_ERR_PROC_RESTART -4 +#define PMIX_ERR_PROC_CHECKPOINT -5 +#define PMIX_ERR_PROC_MIGRATE -6 +#define PMIX_ERR_EXISTS -11 +/* communication failures */ +#define PMIX_ERR_INVALID_CRED -12 +#define PMIX_ERR_WOULD_BLOCK -15 +#define PMIX_ERR_UNKNOWN_DATA_TYPE -16 +#define PMIX_ERR_TYPE_MISMATCH -18 +#define PMIX_ERR_UNPACK_INADEQUATE_SPACE -19 +#define PMIX_ERR_UNPACK_FAILURE -20 +#define PMIX_ERR_PACK_FAILURE -21 +#define PMIX_ERR_NO_PERMISSIONS -23 +#define PMIX_ERR_TIMEOUT -24 +#define PMIX_ERR_UNREACH -25 +#define PMIX_ERR_BAD_PARAM -27 +#define PMIX_ERR_RESOURCE_BUSY -28 +#define PMIX_ERR_OUT_OF_RESOURCE -29 +#define PMIX_ERR_INIT -31 +#define PMIX_ERR_NOMEM -32 +#define PMIX_ERR_NOT_FOUND -46 +#define PMIX_ERR_NOT_SUPPORTED -47 +#define PMIX_ERR_PARAM_VALUE_NOT_SUPPORTED -59 +#define PMIX_ERR_COMM_FAILURE -49 +#define PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER -50 +#define PMIX_ERR_CONFLICTING_CLEANUP_DIRECTIVES -51 +#define PMIX_ERR_PARTIAL_SUCCESS -52 +#define PMIX_ERR_DUPLICATE_KEY -53 +#define PMIX_ERR_EMPTY -60 +#define PMIX_ERR_LOST_CONNECTION -61 +#define PMIX_ERR_EXISTS_OUTSIDE_SCOPE -62 + +/* Process set */ +#define PMIX_PROCESS_SET_DEFINE -55 +#define PMIX_PROCESS_SET_DELETE -56 + +/* Debugger ops */ +#define PMIX_DEBUGGER_RELEASE -3 // replaced deprecated PMIX_ERR_DEBUGGER_RELEASE +#define PMIX_READY_FOR_DEBUG -58 // accompanied by PMIX_BREAKPOINT indicating where proc is waiting + +/* query errors */ +#define PMIX_QUERY_PARTIAL_SUCCESS -104 + +/* job control */ +#define PMIX_JCTRL_CHECKPOINT -106 // monitored by client to trigger checkpoint operation +#define PMIX_JCTRL_CHECKPOINT_COMPLETE -107 // sent by client and monitored by server 
to notify that requested + // checkpoint operation has completed +#define PMIX_JCTRL_PREEMPT_ALERT -108 // monitored by client to detect RM intends to preempt + +/* monitoring */ +#define PMIX_MONITOR_HEARTBEAT_ALERT -109 +#define PMIX_MONITOR_FILE_ALERT -110 +#define PMIX_PROC_TERMINATED -111 + +/* operational */ +#define PMIX_ERR_EVENT_REGISTRATION -144 +#define PMIX_MODEL_DECLARED -147 +#define PMIX_MODEL_RESOURCES -151 // model resource usage has changed +#define PMIX_OPENMP_PARALLEL_ENTERED -152 // an OpenMP parallel region has been entered +#define PMIX_OPENMP_PARALLEL_EXITED -153 // an OpenMP parallel region has completed +#define PMIX_LAUNCHER_READY -155 +#define PMIX_OPERATION_IN_PROGRESS -156 +#define PMIX_OPERATION_SUCCEEDED -157 +#define PMIX_ERR_INVALID_OPERATION -158 +#define PMIX_GROUP_INVITED -159 +#define PMIX_GROUP_LEFT -160 +#define PMIX_GROUP_INVITE_ACCEPTED -161 +#define PMIX_GROUP_INVITE_DECLINED -162 +#define PMIX_GROUP_INVITE_FAILED -163 +#define PMIX_GROUP_MEMBERSHIP_UPDATE -164 +#define PMIX_GROUP_CONSTRUCT_ABORT -165 +#define PMIX_GROUP_CONSTRUCT_COMPLETE -166 +#define PMIX_GROUP_LEADER_SELECTED -167 +#define PMIX_GROUP_LEADER_FAILED -168 +#define PMIX_GROUP_CONTEXT_ID_ASSIGNED -169 +#define PMIX_GROUP_MEMBER_FAILED -170 +#define PMIX_ERR_REPEAT_ATTR_REGISTRATION -171 +#define PMIX_ERR_IOF_FAILURE -172 +#define PMIX_ERR_IOF_COMPLETE -173 +#define PMIX_LAUNCH_COMPLETE -174 // include nspace of the launched job with notification +#define PMIX_FABRIC_UPDATED -175 +#define PMIX_FABRIC_UPDATE_PENDING -176 +#define PMIX_FABRIC_UPDATE_ENDPOINTS -113 + +/* job-related errors */ +#define PMIX_ERR_JOB_APP_NOT_EXECUTABLE -177 +#define PMIX_ERR_JOB_NO_EXE_SPECIFIED -178 +#define PMIX_ERR_JOB_FAILED_TO_MAP -179 +#define PMIX_ERR_JOB_CANCELED -180 +#define PMIX_ERR_JOB_FAILED_TO_LAUNCH -181 +#define PMIX_ERR_JOB_ABORTED -182 +#define PMIX_ERR_JOB_KILLED_BY_CMD -183 +#define PMIX_ERR_JOB_ABORTED_BY_SIG -184 +#define PMIX_ERR_JOB_TERM_WO_SYNC -185 
+#define PMIX_ERR_JOB_SENSOR_BOUND_EXCEEDED -186 +#define PMIX_ERR_JOB_NON_ZERO_TERM -187 +#define PMIX_ERR_JOB_ALLOC_FAILED -188 +#define PMIX_ERR_JOB_ABORTED_BY_SYS_EVENT -189 +#define PMIX_ERR_JOB_EXE_NOT_FOUND -190 +#define PMIX_ERR_JOB_WDIR_NOT_FOUND -233 +#define PMIX_ERR_JOB_INSUFFICIENT_RESOURCES -234 +#define PMIX_ERR_JOB_SYS_OP_FAILED -235 + +/* job-related non-error events */ +#define PMIX_EVENT_JOB_START -191 +#define PMIX_EVENT_JOB_END -145 +#define PMIX_EVENT_SESSION_START -192 +#define PMIX_EVENT_SESSION_END -193 + +/* process-related events */ +#define PMIX_ERR_PROC_TERM_WO_SYNC -200 +#define PMIX_EVENT_PROC_TERMINATED -201 + +/* system failures */ +#define PMIX_EVENT_SYS_BASE -230 +#define PMIX_EVENT_NODE_DOWN -231 +#define PMIX_EVENT_NODE_OFFLINE -232 +#define PMIX_EVENT_SYS_OTHER -330 + + +/* define a macro for identifying system event values */ +#define PMIX_SYSTEM_EVENT(a) \ + ((a) <= PMIX_EVENT_SYS_BASE && PMIX_EVENT_SYS_OTHER <= (a)) + +/* used by event handlers */ +#define PMIX_EVENT_NO_ACTION_TAKEN -331 +#define PMIX_EVENT_PARTIAL_ACTION_TAKEN -332 +#define PMIX_EVENT_ACTION_DEFERRED -333 +#define PMIX_EVENT_ACTION_COMPLETE -334 + +/* define a starting point for user-level defined error + * constants - negative values larger than this are guaranteed + * not to conflict with PMIx values. 
Definitions should always + * be based on the PMIX_EXTERNAL_ERR_BASE constant and -not- a + * specific value as the value of the constant may change */ +#define PMIX_EXTERNAL_ERR_BASE -3000 + +/* define the results values for comparisons */ +typedef enum { + PMIX_EQUAL, + PMIX_VALUE1_GREATER, + PMIX_VALUE2_GREATER, + PMIX_VALUE_TYPE_DIFFERENT, + PMIX_VALUE_INCOMPATIBLE_OBJECTS, + PMIX_VALUE_COMPARISON_NOT_AVAIL +} pmix_value_cmp_t; + +/**** PMIX DATA TYPES ****/ +typedef uint16_t pmix_data_type_t; +#define PMIX_UNDEF 0 +#define PMIX_BOOL 1 // converted to/from native true/false to uint8 for pack/unpack +#define PMIX_BYTE 2 // a byte of data +#define PMIX_STRING 3 // NULL-terminated string +#define PMIX_SIZE 4 // size_t +#define PMIX_PID 5 // OS-pid +#define PMIX_INT 6 +#define PMIX_INT8 7 +#define PMIX_INT16 8 +#define PMIX_INT32 9 +#define PMIX_INT64 10 +#define PMIX_UINT 11 +#define PMIX_UINT8 12 +#define PMIX_UINT16 13 +#define PMIX_UINT32 14 +#define PMIX_UINT64 15 +#define PMIX_FLOAT 16 +#define PMIX_DOUBLE 17 +#define PMIX_TIMEVAL 18 +#define PMIX_TIME 19 +#define PMIX_STATUS 20 // needs to be tracked separately from integer for those times + // when we are embedded and it needs to be converted to the + // host error definitions +#define PMIX_VALUE 21 +#define PMIX_PROC 22 +#define PMIX_APP 23 +#define PMIX_INFO 24 +#define PMIX_PDATA 25 +// Hole left by deprecation/removal of PMIX_BUFFER +#define PMIX_BYTE_OBJECT 27 +#define PMIX_KVAL 28 +// Hole left by deprecation/removal of PMIX_MODEX +#define PMIX_PERSIST 30 +#define PMIX_POINTER 31 +#define PMIX_SCOPE 32 +#define PMIX_DATA_RANGE 33 +#define PMIX_COMMAND 34 +#define PMIX_INFO_DIRECTIVES 35 +#define PMIX_DATA_TYPE 36 +#define PMIX_PROC_STATE 37 +#define PMIX_PROC_INFO 38 +#define PMIX_DATA_ARRAY 39 +#define PMIX_PROC_RANK 40 +#define PMIX_QUERY 41 +#define PMIX_COMPRESSED_STRING 42 // string compressed with zlib +#define PMIX_ALLOC_DIRECTIVE 43 +// Hole left by deprecation/removal of PMIX_INFO_ARRAY 
+#define PMIX_IOF_CHANNEL 45 +#define PMIX_ENVAR 46 +#define PMIX_COORD 47 +#define PMIX_REGATTR 48 +#define PMIX_REGEX 49 +#define PMIX_JOB_STATE 50 +#define PMIX_LINK_STATE 51 +#define PMIX_PROC_CPUSET 52 +#define PMIX_GEOMETRY 53 +#define PMIX_DEVICE_DIST 54 +#define PMIX_ENDPOINT 55 +#define PMIX_TOPO 56 +#define PMIX_DEVTYPE 57 +#define PMIX_LOCTYPE 58 +#define PMIX_COMPRESSED_BYTE_OBJECT 59 +#define PMIX_PROC_NSPACE 60 +#define PMIX_PROC_STATS 61 +#define PMIX_DISK_STATS 62 +#define PMIX_NET_STATS 63 +#define PMIX_NODE_STATS 64 +#define PMIX_DATA_BUFFER 65 +#define PMIX_STOR_MEDIUM 66 +#define PMIX_STOR_ACCESS 67 +#define PMIX_STOR_PERSIST 68 +#define PMIX_STOR_ACCESS_TYPE 69 +/********************/ + +/* define a boundary for implementers so they can add their own data types */ +#define PMIX_DATA_TYPE_MAX 500 + + +/* define a scope for data "put" by PMIx per the following: + * + * PMI_LOCAL - the data is intended only for other application + * processes on the same node. Data marked in this way + * will not be included in data packages sent to remote requestors + * PMI_REMOTE - the data is intended solely for applications processes on + * remote nodes. 
Data marked in this way will not be shared with + * other processes on the same node + * PMI_GLOBAL - the data is to be shared with all other requesting processes, + * regardless of location + */ +typedef uint8_t pmix_scope_t; +#define PMIX_SCOPE_UNDEF 0 +#define PMIX_LOCAL 1 // share to procs also on this node +#define PMIX_REMOTE 2 // share with procs not on this node +#define PMIX_GLOBAL 3 // share with all procs (local + remote) +#define PMIX_INTERNAL 4 // store data in the internal tables + +/* define a range for data "published" by PMIx + */ +typedef uint8_t pmix_data_range_t; +#define PMIX_RANGE_UNDEF 0 +#define PMIX_RANGE_RM 1 // data is intended for the host resource manager +#define PMIX_RANGE_LOCAL 2 // available on local node only +#define PMIX_RANGE_NAMESPACE 3 // data is available to procs in the same nspace only +#define PMIX_RANGE_SESSION 4 // data available to all procs in session +#define PMIX_RANGE_GLOBAL 5 // data available to all procs +#define PMIX_RANGE_CUSTOM 6 // range is specified in a pmix_info_t +#define PMIX_RANGE_PROC_LOCAL 7 // restrict range to the local proc +#define PMIX_RANGE_INVALID UINT8_MAX + +/* define a "persistence" policy for data published by clients */ +typedef uint8_t pmix_persistence_t; +#define PMIX_PERSIST_INDEF 0 // retain until specifically deleted +#define PMIX_PERSIST_FIRST_READ 1 // delete upon first access +#define PMIX_PERSIST_PROC 2 // retain until publishing process terminates +#define PMIX_PERSIST_APP 3 // retain until application terminates +#define PMIX_PERSIST_SESSION 4 // retain until session/allocation terminates +#define PMIX_PERSIST_INVALID UINT8_MAX + +/* define a set of bit-mask flags for specifying behavior of + * command directives via pmix_info_t arrays */ +typedef uint32_t pmix_info_directives_t; +#define PMIX_INFO_REQD 0x00000001 +#define PMIX_INFO_ARRAY_END 0x00000002 // mark the end of an array created by PMIX_INFO_CREATE +#define PMIX_INFO_REQD_PROCESSED 0x00000004 // reqd attribute has been 
processed +#define PMIX_INFO_QUALIFIER 0x00000008 // info is a qualifier to the primary value +#define PMIX_INFO_PERSISTENT 0x00000010 // do not release included value +/* the top 16-bits are reserved for internal use by + * implementers - these may be changed inside the + * PMIx library */ +#define PMIX_INFO_DIR_RESERVED 0xffff0000 + +/* define a set of directives for allocation requests */ +typedef uint8_t pmix_alloc_directive_t; +#define PMIX_ALLOC_NEW 1 // new allocation is being requested. The resulting allocation will be + // disjoint (i.e., not connected in a job sense) from the requesting allocation +#define PMIX_ALLOC_EXTEND 2 // extend the existing allocation, either in time or as additional resources +#define PMIX_ALLOC_RELEASE 3 // release part of the existing allocation. Attributes in the accompanying + // pmix\_info\_t array may be used to specify permanent release of the + // identified resources, or "lending" of those resources for some period + // of time. +#define PMIX_ALLOC_REAQUIRE 4 // reacquire resources that were previously "lent" back to the scheduler + +/* define a value boundary beyond which implementers are free + * to define their own directive values */ +#define PMIX_ALLOC_EXTERNAL 128 + + +/* define a set of bit-mask flags for specifying IO + * forwarding channels. 
These can be OR'd together + * to reference multiple channels */ +typedef uint16_t pmix_iof_channel_t; +#define PMIX_FWD_NO_CHANNELS 0x0000 +#define PMIX_FWD_STDIN_CHANNEL 0x0001 +#define PMIX_FWD_STDOUT_CHANNEL 0x0002 +#define PMIX_FWD_STDERR_CHANNEL 0x0004 +#define PMIX_FWD_STDDIAG_CHANNEL 0x0008 +#define PMIX_FWD_ALL_CHANNELS 0x00ff + +/* define values associated with PMIx_Group_join + * to indicate accept and decline - this is + * done for readability of user code */ +typedef enum { + PMIX_GROUP_DECLINE, + PMIX_GROUP_ACCEPT +} pmix_group_opt_t; + +typedef enum { + PMIX_GROUP_CONSTRUCT, + PMIX_GROUP_DESTRUCT +} pmix_group_operation_t; + +/* define storage medium values + * The pmix_storage_medium_t is a uint64_t type that defines + * a set of bit-mask flags for specifying different types of + * storage mediums. These can be bitwise OR'd together to + * accommodate storage systems that mix storage medium types. */ +typedef uint64_t pmix_storage_medium_t; +#define PMIX_STORAGE_MEDIUM_UNKNOWN 0x0000000000000001 +#define PMIX_STORAGE_MEDIUM_TAPE 0x0000000000000002 +#define PMIX_STORAGE_MEDIUM_HDD 0x0000000000000004 +#define PMIX_STORAGE_MEDIUM_SSD 0x0000000000000008 +#define PMIX_STORAGE_MEDIUM_NVME 0x0000000000000010 +#define PMIX_STORAGE_MEDIUM_PMEM 0x0000000000000020 +#define PMIX_STORAGE_MEDIUM_RAM 0x0000000000000040 + +/* define storage accessibility values + * The pmix_storage_accessibility_t is a uint64_t type that + * defines a set of bit-mask flags for specifying different + * levels of storage accessibility (i.e,. from where a storage + * system may be accessed). These can be bitwise OR'd together + * to accommodate storage systems that are accessibile in + * multiple ways. 
*/ +typedef uint64_t pmix_storage_accessibility_t; +#define PMIX_STORAGE_ACCESSIBILITY_NODE 0x0000000000000001 +#define PMIX_STORAGE_ACCESSIBILITY_SESSION 0x0000000000000002 +#define PMIX_STORAGE_ACCESSIBILITY_JOB 0x0000000000000004 +#define PMIX_STORAGE_ACCESSIBILITY_RACK 0x0000000000000008 +#define PMIX_STORAGE_ACCESSIBILITY_CLUSTER 0x0000000000000010 +#define PMIX_STORAGE_ACCESSIBILITY_REMOTE 0x0000000000000020 + +/* define storage persistence values + * The pmix_storage_persistence_t is a uint64_t type that defines + * a set of bit-mask flags for specifying different levels of + * persistence for a particular storage system. */ +typedef uint64_t pmix_storage_persistence_t; +#define PMIX_STORAGE_PERSISTENCE_TEMPORARY 0x0000000000000001 +#define PMIX_STORAGE_PERSISTENCE_NODE 0x0000000000000002 +#define PMIX_STORAGE_PERSISTENCE_SESSION 0x0000000000000004 +#define PMIX_STORAGE_PERSISTENCE_JOB 0x0000000000000008 +#define PMIX_STORAGE_PERSISTENCE_SCRATCH 0x0000000000000010 +#define PMIX_STORAGE_PERSISTENCE_PROJECT 0x0000000000000020 +#define PMIX_STORAGE_PERSISTENCE_ARCHIVE 0x0000000000000040 + +/* define storage access values + * The pmix_storage_access_type_t is a uint16_t type that defines + * a set of bit-mask flags for specifying different storage system + * access types. 
*/ +typedef uint16_t pmix_storage_access_type_t; +#define PMIX_STORAGE_ACCESS_RD 0x0001 +#define PMIX_STORAGE_ACCESS_WR 0x0002 +#define PMIX_STORAGE_ACCESS_RDWR 0x0003 + + +/* define some "hooks" external libraries can use to + * intercept memory allocation/release operations */ +static inline void* pmix_malloc(size_t n) +{ + return malloc(n); +} + +static inline void pmix_free(void *m) +{ + free(m); +} + +static inline void* pmix_calloc(size_t n, size_t m) +{ + return calloc(n, m); +} + +/* declare a convenience macro for checking keys */ +#define PMIX_CHECK_KEY(a, b) \ + (0 == strncmp((a)->key, (b), PMIX_MAX_KEYLEN)) + +#define PMIX_CHECK_RESERVED_KEY(a) \ + (0 == strncmp((a), "pmix", 4)) + +#define PMIX_LOAD_KEY(a, b) \ + do { \ + memset((a), 0, PMIX_MAX_KEYLEN+1); \ + if (NULL != (b)) { \ + pmix_strncpy((char*)(a), (const char*)(b), PMIX_MAX_KEYLEN); \ + } \ + }while(0) + +/* define a convenience macro for loading nspaces */ +#define PMIX_LOAD_NSPACE(a, b) \ + do { \ + memset((a), 0, PMIX_MAX_NSLEN+1); \ + if (NULL != (b)) { \ + pmix_strncpy((char*)(a), (b), PMIX_MAX_NSLEN); \ + } \ + }while(0) + +/* define a convenience macro for checking nspaces */ +#define PMIX_CHECK_NSPACE(a, b) \ + (PMIX_NSPACE_INVALID((a)) || PMIX_NSPACE_INVALID((b)) || 0 == strncmp((a), (b), PMIX_MAX_NSLEN)) + +/* define a convenience macro for loading names */ +#define PMIX_LOAD_PROCID(a, b, c) \ + do { \ + PMIX_LOAD_NSPACE((a)->nspace, (b)); \ + (a)->rank = (c); \ + }while(0) + +#define PMIX_XFER_PROCID(a, b) \ + memcpy((a), (b), sizeof(pmix_proc_t)) + +#define PMIX_PROCID_XFER(a, b) PMIX_XFER_PROCID(a, b) + +/* define a convenience macro for checking names */ +#define PMIX_CHECK_PROCID(a, b) \ + (PMIX_CHECK_NSPACE((a)->nspace, (b)->nspace) && ((a)->rank == (b)->rank || (PMIX_RANK_WILDCARD == (a)->rank || PMIX_RANK_WILDCARD == (b)->rank))) + +#define PMIX_CHECK_RANK(a, b) \ + ((a) == (b) || (PMIX_RANK_WILDCARD == (a) || PMIX_RANK_WILDCARD == (b))) + +#define PMIX_NSPACE_INVALID(a) \ + 
(NULL == (a) || 0 == pmix_nslen((a))) + +#define PMIX_PROCID_INVALID(a) \ + (PMIX_NSPACE_INVALID((a)->nspace) || PMIX_RANK_INVALID == (a)->rank) + +/** + * Provide a safe version of strncpy that doesn't generate + * a ton of spurious warnings. Note that not every environment + * provides nice string functions, and we aren't concerned about + * max performance here + * + * @param dest Destination string. + * @param src Source string. + * @param len Size of the dest array - 1 + * + */ +static inline void pmix_strncpy(char *dest, + const char *src, + size_t len) +{ + size_t i; + + /* use an algorithm that also protects against + * non-NULL-terminated src strings */ + for (i=0; i < len; ++i, ++src, ++dest) { + *dest = *src; + if ('\0' == *src) { + break; + } + } + *dest = '\0'; +} + +static inline size_t pmix_keylen(const char *src) +{ + size_t i, maxlen; + + if (NULL == src) { + return 0; + } + maxlen = PMIX_MAX_KEYLEN + 1; + /* use an algorithm that also protects against + * non-NULL-terminated src strings */ + for (i=0; i < maxlen; ++i, ++src) { + if ('\0' == *src) { + break; + } + } + return i; +} + +static inline size_t pmix_nslen(const char *src) +{ + size_t i, maxlen; + + if (NULL == src) { + return 0; + } + maxlen = PMIX_MAX_NSLEN + 1; + /* use an algorithm that also protects against + * non-NULL-terminated src strings */ + for (i=0; i < maxlen; ++i, ++src) { + if ('\0' == *src) { + break; + } + } + return i; +} + +static inline +int pmix_argv_count(char **argv) +{ + char **p; + int i; + + if (NULL == argv) + return 0; + + for (i = 0, p = argv; *p; i++, p++) + continue; + + return i; +} + +#define PMIX_ARGV_COUNT(r, a) \ + (r) = pmix_argv_count(a) + +static inline +pmix_status_t pmix_argv_append_nosize(char ***argv, const char *arg) +{ + int argc; + + /* Create new argv. 
*/ + + if (NULL == *argv) { + *argv = (char **) malloc(2 * sizeof(char *)); + if (NULL == *argv) { + return PMIX_ERR_OUT_OF_RESOURCE; + } + argc = 0; + (*argv)[0] = NULL; + (*argv)[1] = NULL; + } + + /* Extend existing argv. */ + else { + /* count how many entries currently exist */ + argc = pmix_argv_count(*argv); + + *argv = (char **) realloc(*argv, (argc + 2) * sizeof(char *)); + if (NULL == *argv) { + return PMIX_ERR_OUT_OF_RESOURCE; + } + } + + /* Set the newest element to point to a copy of the arg string */ + + (*argv)[argc] = strdup(arg); + if (NULL == (*argv)[argc]) { + return PMIX_ERR_OUT_OF_RESOURCE; + } + + argc = argc + 1; + (*argv)[argc] = NULL; + + return PMIX_SUCCESS; +} + +#define PMIX_ARGV_APPEND(r, a, b) \ + (r) = pmix_argv_append_nosize(&(a), (b)) + +static inline +pmix_status_t pmix_argv_prepend_nosize(char ***argv, const char *arg) +{ + int argc; + int i; + + /* Create new argv. */ + + if (NULL == *argv) { + *argv = (char **) malloc(2 * sizeof(char *)); + if (NULL == *argv) { + return PMIX_ERR_OUT_OF_RESOURCE; + } + (*argv)[0] = strdup(arg); + (*argv)[1] = NULL; + } else { + /* count how many entries currently exist */ + argc = pmix_argv_count(*argv); + + *argv = (char **) realloc(*argv, (argc + 2) * sizeof(char *)); + if (NULL == *argv) { + return PMIX_ERR_OUT_OF_RESOURCE; + } + (*argv)[argc + 1] = NULL; + + /* shift all existing elements down 1 */ + for (i = argc; 0 < i; i--) { + (*argv)[i] = (*argv)[i - 1]; + } + (*argv)[0] = strdup(arg); + } + + return PMIX_SUCCESS; +} + +#define PMIX_ARGV_PREPEND(r, a, b) \ + (r) = pmix_argv_prepend_nosize(&(a), b) + +static inline +pmix_status_t pmix_argv_append_unique_nosize(char ***argv, const char *arg) +{ + int i; + + /* if the provided array is NULL, then the arg cannot be present, + * so just go ahead and append + */ + if (NULL == *argv) { + return pmix_argv_append_nosize(argv, arg); + } + + /* see if this arg is already present in the array */ + for (i = 0; NULL != (*argv)[i]; i++) { + if (0 == 
strcmp(arg, (*argv)[i])) { + /* already exists */ + return PMIX_SUCCESS; + } + } + + /* we get here if the arg is not in the array - so add it */ + return pmix_argv_append_nosize(argv, arg); +} + +#define PMIX_ARGV_APPEND_UNIQUE(r, a, b) \ + (r) = pmix_argv_append_unique_nosize(a, b) + +static inline void pmix_argv_free(char **argv) +{ + char **p; + + if (NULL == argv) + return; + + for (p = argv; NULL != *p; ++p) { + pmix_free(*p); + } + + pmix_free(argv); +} + +#define PMIX_ARGV_FREE(a) pmix_argv_free(a) + +static inline +char **pmix_argv_split_inter(const char *src_string, + int delimiter, + bool include_empty) +{ + char arg[512]; + char **argv = NULL; + const char *p; + char *argtemp; + size_t arglen; + + while (src_string && *src_string) { + p = src_string; + arglen = 0; + + while (('\0' != *p) && (*p != delimiter)) { + ++p; + ++arglen; + } + + /* zero length argument, skip */ + + if (src_string == p) { + if (include_empty) { + arg[0] = '\0'; + if (PMIX_SUCCESS != pmix_argv_append_nosize(&argv, arg)) { + return NULL; + } + } + src_string = p + 1; + continue; + } + + /* tail argument, add straight from the original string */ + + else if ('\0' == *p) { + if (PMIX_SUCCESS != pmix_argv_append_nosize(&argv, src_string)) { + return NULL; + } + src_string = p; + continue; + } + + /* long argument, malloc buffer, copy and add */ + + else if (arglen > 511) { + argtemp = (char *) malloc(arglen + 1); + if (NULL == argtemp) + return NULL; + + pmix_strncpy(argtemp, src_string, arglen); + argtemp[arglen] = '\0'; + + if (PMIX_SUCCESS != pmix_argv_append_nosize(&argv, argtemp)) { + free(argtemp); + return NULL; + } + + free(argtemp); + } + + /* short argument, copy to buffer and add */ + + else { + pmix_strncpy(arg, src_string, arglen); + arg[arglen] = '\0'; + + if (PMIX_SUCCESS != pmix_argv_append_nosize(&argv, arg)) { + return NULL; + } + } + + src_string = p + 1; + } + + /* All done */ + + return argv; +} + +static inline +char **pmix_argv_split_with_empty(const char 
*src_string, int delimiter) +{ + return pmix_argv_split_inter(src_string, delimiter, true); +} + +static inline +char **pmix_argv_split(const char *src_string, int delimiter) +{ + return pmix_argv_split_inter(src_string, delimiter, false); +} + +#define PMIX_ARGV_SPLIT(a, b, c) \ + (a) = pmix_argv_split(b, c) + +static inline +char *pmix_argv_join(char **argv, int delimiter) +{ + char **p; + char *pp; + char *str; + size_t str_len = 0; + size_t i; + + /* Bozo case */ + + if (NULL == argv || NULL == argv[0]) { + return strdup(""); + } + + /* Find the total string length in argv including delimiters. The + last delimiter is replaced by the NULL character. */ + + for (p = argv; *p; ++p) { + str_len += strlen(*p) + 1; + } + + /* Allocate the string. */ + + if (NULL == (str = (char *) malloc(str_len))) + return NULL; + + /* Loop filling in the string. */ + + str[--str_len] = '\0'; + p = argv; + pp = *p; + + for (i = 0; i < str_len; ++i) { + if ('\0' == *pp) { + + /* End of a string, fill in a delimiter and go to the next + string. */ + + str[i] = (char) delimiter; + ++p; + pp = *p; + } else { + str[i] = *pp++; + } + } + + /* All done */ + + return str; +} + +#define PMIX_ARGV_JOIN(a, b, c) \ + (a) = pmix_argv_join(b, c) + +static inline +char **pmix_argv_copy(char **argv) +{ + char **dupv = NULL; + + if (NULL == argv) + return NULL; + + /* create an "empty" list, so that we return something valid if we + were passed a valid list with no contained elements */ + dupv = (char **) malloc(sizeof(char *)); + dupv[0] = NULL; + + while (NULL != *argv) { + if (PMIX_SUCCESS != pmix_argv_append_nosize(&dupv, *argv)) { + PMIX_ARGV_FREE(dupv); + return NULL; + } + + ++argv; + } + + /* All done */ + + return dupv; +} + +#define PMIX_ARGV_COPY(a, b) \ + (a) = pmix_argv_copy(b) + +/** + * Portable version of setenv(3), allowing editing of any + * environ-like array. 
+ * + * @param name String name of the environment variable to look for + * @param value String value to set (may be NULL) + * @param overwrite Whether to overwrite any existing value with + * the same name + * @param env The environment to use + * + * @retval PMIX_ERR_OUT_OF_RESOURCE If internal malloc() fails. + * @retval PMIX_ERR_EXISTS If the name already exists in \em env and + * \em overwrite is false (and therefore the \em value was not + * saved in \em env) + * @retval PMIX_SUCESS If the value replaced another value or is + * appended to \em env. + * + * \em env is expected to be a NULL-terminated array of pointers + * (argv-style). Note that unlike some implementations of + * putenv(3), if \em value is inserted in \em env, it is copied. + * So the caller can modify/free both \em name and \em value after + * pmix_setenv() returns. + * + * The \em env array will be grown if necessary. + * + * It is permissible to invoke this function with the + * system-defined \em environ variable. For example: + * + * \code + * #include "pmix_common.h" + * pmix_setenv("foo", "bar", true, &environ); + * \endcode + * + * NOTE: If you use the real environ, pmix_setenv() will turn + * around and perform setenv() to put the value in the + * environment. This may very well lead to a memory leak, so its + * use is strongly discouraged. 
+ * + * It is also permissible to call this function with an empty \em + * env, as long as it is pre-initialized with NULL: + * + * \code + * char **my_env = NULL; + * pmix_setenv("foo", "bar", true, &my_env); + * \endcode + */ +static inline +pmix_status_t pmix_setenv(const char *name, + const char *value, + bool overwrite, + char ***env) +{ + int i; + char newvalue[100000], compare[100000]; + size_t len; + bool valid; + + /* Check the bozo case */ + if (NULL == env) { + return PMIX_ERR_BAD_PARAM; + } + + if (NULL != value) { + /* check the string for unacceptable length - i.e., ensure + * it is NULL-terminated */ + valid = false; + for (i = 0; i < 100000; i++) { + if ('\0' == value[i]) { + valid = true; + break; + } + } + if (!valid) { + return PMIX_ERR_BAD_PARAM; + } + } + + /* If this is the "environ" array, use setenv */ + if (*env == environ) { + if (NULL == value) { + /* this is actually an unsetenv request */ + unsetenv(name); + } else { + setenv(name, value, overwrite); + } + return PMIX_SUCCESS; + } + + /* Make the new value */ + if (NULL == value) { + snprintf(newvalue, 100000, "%s=", name); + } else { + snprintf(newvalue, 100000, "%s=%s", name, value); + } + + if (NULL == *env) { + pmix_argv_append_nosize(env, newvalue); + return PMIX_SUCCESS; + } + + /* Make something easy to compare to */ + + snprintf(compare, 100000, "%s=", name); + len = strlen(compare); + + /* Look for a duplicate that's already set in the env */ + + for (i = 0; (*env)[i] != NULL; ++i) { + if (0 == strncmp((*env)[i], compare, len)) { + if (overwrite) { + free((*env)[i]); + (*env)[i] = strdup(newvalue); + return PMIX_SUCCESS; + } else { + return PMIX_ERR_EXISTS; + } + } + } + + /* If we found no match, append this value */ + + pmix_argv_append_nosize(env, newvalue); + + /* All done */ + return PMIX_SUCCESS; +} + +#define PMIX_SETENV(r, a, b, c) \ + (r) = pmix_setenv((a), (b), true, (c)) + + +/**** PMIX COORD ****/ +/* define coordinate system views */ +typedef uint8_t 
pmix_coord_view_t; +#define PMIX_COORD_VIEW_UNDEF 0x00 +#define PMIX_COORD_LOGICAL_VIEW 0x01 +#define PMIX_COORD_PHYSICAL_VIEW 0x02 + +/* define a structure for a proc's fabric coordinate */ +typedef struct pmix_coord { + pmix_coord_view_t view; + uint32_t *coord; + size_t dims; +} pmix_coord_t; + +#define PMIX_COORD_STATIC_INIT \ +{ \ + .view = PMIX_COORD_VIEW_UNDEF, \ + .coord = NULL, \ + .dims = 0 \ +} + +#define PMIX_COORD_CREATE(m, d, n) \ + do { \ + pmix_coord_t *_m; \ + if (0 == (d)) { \ + (m) = NULL; \ + } else { \ + _m = (pmix_coord_t*)pmix_malloc((d) * sizeof(pmix_coord_t)); \ + if (NULL != _m) { \ + memset((m), 0, (d)*sizeof(pmix_coord_t)); \ + _m->view = PMIX_COORD_VIEW_UNDEF; \ + _m->dims = (n); \ + if (0 == (n)) { \ + _m->coord = NULL; \ + } else { \ + _m->coord = (uint32_t*)pmix_malloc((n) * sizeof(uint32_t)); \ + if (NULL != _m->coord) { \ + memset(_m->coord, 0, (n)*sizeof(uint32_t)); \ + } \ + } \ + } \ + (m) = _m; \ + } \ + } while(0) + +#define PMIX_COORD_CONSTRUCT(m) \ + do { \ + (m)->view = PMIX_COORD_VIEW_UNDEF; \ + (m)->coord = NULL; \ + (m)->dims = 0; \ + } while(0) + +#define PMIX_COORD_DESTRUCT(m) \ + do { \ + (m)->view = PMIX_COORD_VIEW_UNDEF; \ + if (NULL != (m)->coord) { \ + pmix_free((m)->coord); \ + (m)->coord = NULL; \ + (m)->dims = 0; \ + } \ + } while(0) + +#define PMIX_COORD_FREE(m, n) \ + do { \ + size_t _nc_; \ + if (NULL != (m)) { \ + for (_nc_ = 0; _nc_ < (n); _nc_++) { \ + PMIX_COORD_DESTRUCT(&(m)[_nc_]); \ + } \ + free((m)); \ + (m) = NULL; \ + } \ + } while(0) + + +/**** PMIX LINK STATES ****/ +typedef uint8_t pmix_link_state_t; +#define PMIX_LINK_STATE_UNKNOWN 0 // The port state is unknown or not applicable +#define PMIX_LINK_DOWN 1 // The port is inactive. +#define PMIX_LINK_UP 2 // The port is active. 
+ + +/**** PMIX CPUSET ****/ +typedef struct{ + char *source; + void *bitmap; +} pmix_cpuset_t; + +#define PMIX_CPUSET_STATIC_INIT \ +{ \ + .source = NULL, \ + .bitmap = NULL \ +} + +#define PMIX_CPUSET_CONSTRUCT(m) \ + memset((m), 0, sizeof(pmix_cpuset_t)) + +#define PMIX_CPUSET_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_cpuset_t*)pmix_malloc((n) * sizeof(pmix_cpuset_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_cpuset_t)); \ + } \ + } \ + } while(0) + + +/**** PMIX BIND ENVELOPE ****/ +typedef uint8_t pmix_bind_envelope_t; +#define PMIX_CPUBIND_PROCESS 0 +#define PMIX_CPUBIND_THREAD 1 + + +/**** PMIX TOPOLOGY ****/ +typedef struct { + char *source; + void *topology; +} pmix_topology_t; + +#define PMIX_TOPOLOGY_STATIC_INIT \ +{ \ + .source = NULL, \ + .topology = NULL \ +} + +#define PMIX_TOPOLOGY_CONSTRUCT(m) \ + memset((m), 0, sizeof(pmix_topology_t)) + +#define PMIX_TOPOLOGY_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_topology_t*)pmix_malloc((n) * sizeof(pmix_topology_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_topology_t)); \ + } \ + } \ + } while(0) + +/**** PMIX RELATIVE LOCALITY ****/ +typedef uint16_t pmix_locality_t; +#define PMIX_LOCALITY_UNKNOWN 0x0000 +#define PMIX_LOCALITY_NONLOCAL 0x8000 +#define PMIX_LOCALITY_SHARE_HWTHREAD 0x0001 +#define PMIX_LOCALITY_SHARE_CORE 0x0002 +#define PMIX_LOCALITY_SHARE_L1CACHE 0x0004 +#define PMIX_LOCALITY_SHARE_L2CACHE 0x0008 +#define PMIX_LOCALITY_SHARE_L3CACHE 0x0010 +#define PMIX_LOCALITY_SHARE_PACKAGE 0x0020 +#define PMIX_LOCALITY_SHARE_NUMA 0x0040 +#define PMIX_LOCALITY_SHARE_NODE 0x4000 + + +/**** PMIX GEOMETRY ****/ +typedef struct pmix_geometry { + size_t fabric; + char *uuid; + char *osname; + pmix_coord_t *coordinates; + size_t ncoords; +} pmix_geometry_t; + +#define PMIX_GEOMETRY_STATIC_INIT \ +{ \ + .fabric = 0, \ + .uuid = NULL, \ + .osname = NULL, \ + .coordinates = NULL, \ + 
.ncoords = 0 \ +} + +#define PMIX_GEOMETRY_CONSTRUCT(m) \ + memset((m), 0, sizeof(pmix_geometry_t)); + +#define PMIX_GEOMETRY_DESTRUCT(m) \ + do { \ + if (NULL != (m)->uuid) { \ + free((m)->uuid); \ + (m)->uuid = NULL; \ + } \ + if (NULL != (m)->osname) { \ + free((m)->osname); \ + (m)->osname = NULL; \ + } \ + if (NULL != (m)->coordinates) { \ + PMIX_COORD_FREE((m)->coordinates, (m)->ncoords); \ + } \ + } while(0) + +#define PMIX_GEOMETRY_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_geometry_t*)pmix_malloc((n) * sizeof(pmix_geometry_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_geometry_t)); \ + } \ + } \ + } while(0) + +#define PMIX_GEOMETRY_FREE(m, n) \ + do { \ + size_t _i; \ + if (NULL != (m)) { \ + for (_i=0; _i < (n); _i++) { \ + PMIX_GEOMETRY_DESTRUCT(&(m)[_i]); \ + } \ + pmix_free((m)); \ + (m) = NULL; \ + } \ + } while(0) + + +/**** PMIX_DEVICE_TYPE ****/ +typedef uint64_t pmix_device_type_t; +#define PMIX_DEVTYPE_UNKNOWN 0x00 +#define PMIX_DEVTYPE_BLOCK 0x01 +#define PMIX_DEVTYPE_GPU 0x02 +#define PMIX_DEVTYPE_NETWORK 0x04 +#define PMIX_DEVTYPE_OPENFABRICS 0x08 +#define PMIX_DEVTYPE_DMA 0x10 +#define PMIX_DEVTYPE_COPROC 0x20 + +/**** PMIX_DISTANCE ****/ +typedef struct pmix_device_distance { + char *uuid; + char *osname; + pmix_device_type_t type; + uint16_t mindist; + uint16_t maxdist; +} pmix_device_distance_t; + +#define PMIX_DEVICE_DIST_STATIC_INIT \ +{ \ + .uuid = NULL, \ + .osname = NULL, \ + .type = PMIX_DEVTYPE_UNKNOWN, \ + .mindist = 0, \ + .maxdist = 0 \ +} + +#define PMIX_DEVICE_DIST_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_device_distance_t)); \ + (m)->mindist = UINT16_MAX; \ + (m)->maxdist = UINT16_MAX; \ + } while(0); + +#define PMIX_DEVICE_DIST_DESTRUCT(m) \ + do { \ + if (NULL != ((m)->uuid)) { \ + pmix_free((m)->uuid); \ + } \ + if (NULL != ((m)->osname)) { \ + pmix_free((m)->osname); \ + } \ + } while(0) + +#define PMIX_DEVICE_DIST_CREATE(m, n) \ + do { \ + size_t 
_i; \ + pmix_device_distance_t *_m; \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + _m = (pmix_device_distance_t*)pmix_malloc((n) * sizeof(pmix_device_distance_t)); \ + if (NULL != _m) { \ + memset(_m, 0, (n)*sizeof(pmix_device_distance_t)); \ + for (_i=0; _i < (n); _i++) { \ + _m[_i].mindist = UINT16_MAX; \ + _m[_i].maxdist = UINT16_MAX; \ + } \ + } \ + (m) = _m; \ + } \ + } while(0) + +#define PMIX_DEVICE_DIST_FREE(m, n) \ + do { \ + size_t _i; \ + if (NULL != (m)) { \ + for (_i=0; _i < (n); _i++) { \ + PMIX_DEVICE_DIST_DESTRUCT(&(m)[_i]); \ + } \ + pmix_free((m)); \ + (m) = NULL; \ + } \ + } while(0) + + +/**** PMIX BYTE OBJECT ****/ +typedef struct pmix_byte_object { + char *bytes; + size_t size; +} pmix_byte_object_t; + +#define PMIX_BYTE_OBJECT_STATIC_INIT \ +{ \ + .bytes = NULL, \ + .size = 0 \ +} + +#define PMIX_BYTE_OBJECT_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_byte_object_t*)pmix_malloc((n) * sizeof(pmix_byte_object_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n)*sizeof(pmix_byte_object_t)); \ + } \ + } \ + } while(0) + +#define PMIX_BYTE_OBJECT_CONSTRUCT(m) \ + do { \ + (m)->bytes = NULL; \ + (m)->size = 0; \ + } while(0) + +#define PMIX_BYTE_OBJECT_DESTRUCT(m) \ + do { \ + if (NULL != (m)->bytes) { \ + pmix_free((m)->bytes); \ + } \ + (m)->bytes = NULL; \ + (m)->size = 0; \ + } while(0) + +#define PMIX_BYTE_OBJECT_FREE(m, n) \ + do { \ + size_t _bon; \ + if (NULL != (m)) { \ + for (_bon=0; _bon < n; _bon++) { \ + PMIX_BYTE_OBJECT_DESTRUCT(&(m)[_bon]); \ + } \ + pmix_free((m)); \ + (m) = NULL; \ + } \ + } while(0) + +#define PMIX_BYTE_OBJECT_LOAD(b, d, s) \ + do { \ + (b)->bytes = (char*)(d); \ + (d) = NULL; \ + (b)->size = (s); \ + (s) = 0; \ + } while(0) + + +/**** PMIX ENDPOINT ****/ +typedef struct pmix_endpoint { + char *uuid; + char *osname; + pmix_byte_object_t endpt; +} pmix_endpoint_t; + +#define PMIX_ENDPOINT_STATIC_INIT \ +{ \ + .uuid = NULL, \ + .osname = NULL, \ + .endpt = 
PMIX_BYTE_OBJECT_STATIC_INIT \ +} + +#define PMIX_ENDPOINT_CONSTRUCT(m) \ + memset((m), 0, sizeof(pmix_endpoint_t)) + +#define PMIX_ENDPOINT_DESTRUCT(m) \ + do { \ + if (NULL != (m)->uuid) { \ + free((m)->uuid); \ + } \ + if (NULL != (m)->osname) { \ + free((m)->osname); \ + } \ + if (NULL != (m)->endpt.bytes) { \ + free((m)->endpt.bytes); \ + } \ + } while(0) + +#define PMIX_ENDPOINT_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_endpoint_t*)pmix_malloc((n) * sizeof(pmix_endpoint_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_endpoint_t)); \ + } \ + } \ + } while(0) + +#define PMIX_ENDPOINT_FREE(m, n) \ + do { \ + size_t _n; \ + if (NULL != (m)) { \ + for (_n=0; _n < (n); _n++) { \ + PMIX_ENDPOINT_DESTRUCT(&((m)[_n])); \ + } \ + free((m)); \ + (m) = NULL; \ + } \ + } while(0) + + + +/**** PMIX ENVAR STRUCT ****/ +/* Provide a structure for specifying environment variable modifications + * Standard environment variables (e.g., PATH, LD_LIBRARY_PATH, and LD_PRELOAD) + * take multiple arguments separated by delimiters. Unfortunately, the delimiters + * depend upon the variable itself - some use semi-colons, some colons, etc. 
Thus, + * the operation requires not only the name of the variable to be modified and + * the value to be inserted, but also the separator to be used when composing + * the aggregate value + */ +typedef struct { + char *envar; + char *value; + char separator; +} pmix_envar_t; + +#define PMIX_ENVAR_STATIC_INIT \ +{ \ + .envar = NULL, \ + .value = NULL, \ + .separator = '\0' \ +} + +#define PMIX_ENVAR_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_envar_t*)pmix_malloc((n) * sizeof(pmix_envar_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_envar_t)); \ + } \ + } \ + } while (0) +#define PMIX_ENVAR_FREE(m, n) \ + do { \ + size_t _ek; \ + if (NULL != (m)) { \ + for (_ek=0; _ek < (n); _ek++) { \ + PMIX_ENVAR_DESTRUCT(&(m)[_ek]); \ + } \ + pmix_free((m)); \ + } \ + } while (0) +#define PMIX_ENVAR_CONSTRUCT(m) \ + do { \ + (m)->envar = NULL; \ + (m)->value = NULL; \ + (m)->separator = '\0'; \ + } while(0) +#define PMIX_ENVAR_DESTRUCT(m) \ + do { \ + if (NULL != (m)->envar) { \ + pmix_free((m)->envar); \ + (m)->envar = NULL; \ + } \ + if (NULL != (m)->value) { \ + pmix_free((m)->value); \ + (m)->value = NULL; \ + } \ + } while(0) +#define PMIX_ENVAR_LOAD(m, e, v, s) \ + do { \ + if (NULL != (e)) { \ + (m)->envar = strdup(e); \ + } \ + if (NULL != (v)) { \ + (m)->value = strdup(v); \ + } \ + (m)->separator = (s); \ + } while(0) + + +/**** PMIX DATA BUFFER MACROS ****/ +#define PMIX_DATA_BUFFER_STATIC_INIT \ +{ \ + .base_ptr = NULL, \ + .pack_ptr = NULL, \ + .unpack_ptr = NULL, \ + .bytes_allocated = 0, \ + .bytes_used = 0 \ +} +#define PMIX_DATA_BUFFER_CREATE(m) \ + do { \ + (m) = (pmix_data_buffer_t*)pmix_malloc(sizeof(pmix_data_buffer_t)); \ + if (NULL != (m)) { \ + memset((m), 0, sizeof(pmix_data_buffer_t)); \ + } \ + } while (0) +#define PMIX_DATA_BUFFER_RELEASE(m) \ + do { \ + if (NULL != (m)->base_ptr) { \ + pmix_free((m)->base_ptr); \ + } \ + pmix_free((m)); \ + (m) = NULL; \ + } while (0) +#define 
PMIX_DATA_BUFFER_CONSTRUCT(m) \ + memset((m), 0, sizeof(pmix_data_buffer_t)) +#define PMIX_DATA_BUFFER_DESTRUCT(m) \ + do { \ + if (NULL != (m)->base_ptr) { \ + pmix_free((m)->base_ptr); \ + (m)->base_ptr = NULL; \ + } \ + (m)->pack_ptr = NULL; \ + (m)->unpack_ptr = NULL; \ + (m)->bytes_allocated = 0; \ + (m)->bytes_used = 0; \ + } while (0) +#define PMIX_DATA_BUFFER_LOAD(b, d, s) \ + do { \ + pmix_byte_object_t _bo; \ + _bo.bytes = (char*)(d); \ + _bo.size = (s); \ + PMIx_Data_load((b), &_bo); \ + } while(0) + +#define PMIX_DATA_BUFFER_UNLOAD(b, d, s) \ + do { \ + pmix_byte_object_t _bo; \ + pmix_status_t _r; \ + _r = PMIx_Data_unload((b), &_bo); \ + if (PMIX_SUCCESS == _r) { \ + (d) = _bo.bytes; \ + (s) = _bo.size; \ + } else { \ + (d) = NULL; \ + (s) = 0; \ + } \ + } while(0) + +/**** PMIX PROC OBJECT ****/ +typedef struct pmix_proc { + pmix_nspace_t nspace; + pmix_rank_t rank; +} pmix_proc_t; + +#define PMIX_PROC_STATIC_INIT \ +{ \ + .nspace = {0}, \ + .rank = PMIX_RANK_UNDEF \ +} + +#define PMIX_PROC_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_proc_t*)pmix_malloc((n) * sizeof(pmix_proc_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_proc_t)); \ + } \ + } \ + } while (0) + +#define PMIX_PROC_RELEASE(m) \ + do { \ + pmix_free((m)); \ + (m) = NULL; \ + } while (0) + +#define PMIX_PROC_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_proc_t)); \ + } while (0) + +#define PMIX_PROC_DESTRUCT(m) + +#define PMIX_PROC_FREE(m, n) \ + do { \ + if (NULL != (m)) { \ + pmix_free((m)); \ + (m) = NULL; \ + } \ + } while (0) + +#define PMIX_PROC_LOAD(m, n, r) \ + do { \ + PMIX_PROC_CONSTRUCT((m)); \ + pmix_strncpy((char*)(m)->nspace, (n), PMIX_MAX_NSLEN); \ + (m)->rank = (r); \ + } while(0) + +#define PMIX_MULTICLUSTER_NSPACE_CONSTRUCT(t, c, n) \ + do { \ + size_t _len; \ + memset((t), 0, PMIX_MAX_NSLEN+1); \ + _len = pmix_nslen((c)); \ + if ((_len + pmix_nslen((n))) < PMIX_MAX_NSLEN) { \ + pmix_strncpy((char*)(t), 
(c), PMIX_MAX_NSLEN); \ + (t)[_len] = ':'; \ + pmix_strncpy((char*)&(t)[_len+1], (n), PMIX_MAX_NSLEN - _len); \ + } \ + } while(0) + +#define PMIX_MULTICLUSTER_NSPACE_PARSE(t, c, n) \ + do { \ + size_t _n, _j; \ + for (_n=0; '\0' != (t)[_n] && ':' != (t)[_n] && \ + _n <= PMIX_MAX_NSLEN; _n++) { \ + (c)[_n] = (t)[_n]; \ + } \ + _n++; \ + for (_j=0; _n <= PMIX_MAX_NSLEN && \ + '\0' != (t)[_n]; _n++, _j++) { \ + (n)[_j] = (t)[_n]; \ + } \ + } while(0) + + +/**** PMIX PROC INFO STRUCT ****/ +typedef struct pmix_proc_info { + pmix_proc_t proc; + char *hostname; + char *executable_name; + pid_t pid; + int exit_code; + pmix_proc_state_t state; +} pmix_proc_info_t; + +#define PMIX_PROC_INFO_STATIC_INIT \ +{ \ + .proc = PMIX_PROC_STATIC_INIT, \ + .hostname = NULL, \ + .executable_name = NULL, \ + .pid = 0, \ + .exit_code = 0, \ + .state = PMIX_PROC_STATE_UNDEF \ +} + +#define PMIX_PROC_INFO_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_proc_info_t*)pmix_malloc((n) * sizeof(pmix_proc_info_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_proc_info_t)); \ + } \ + } \ + } while (0) + +#define PMIX_PROC_INFO_RELEASE(m) \ + do { \ + PMIX_PROC_INFO_FREE((m), 1); \ + } while (0) + +#define PMIX_PROC_INFO_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_proc_info_t)); \ + } while (0) + +#define PMIX_PROC_INFO_DESTRUCT(m) \ + do { \ + if (NULL != (m)->hostname) { \ + pmix_free((m)->hostname); \ + (m)->hostname = NULL; \ + } \ + if (NULL != (m)->executable_name) { \ + pmix_free((m)->executable_name); \ + (m)->executable_name = NULL; \ + } \ + } while(0) + +#define PMIX_PROC_INFO_FREE(m, n) \ + do { \ + size_t _k; \ + if (NULL != (m)) { \ + for (_k=0; _k < (n); _k++) { \ + PMIX_PROC_INFO_DESTRUCT(&(m)[_k]); \ + } \ + pmix_free((m)); \ + } \ + } while (0) + + +/**** PMIX DATA ARRAY STRUCT ****/ + +typedef struct pmix_data_array { + pmix_data_type_t type; + size_t size; + void *array; +} pmix_data_array_t; + +#define 
PMIX_DATA_ARRAY_STATIC_INIT \ +{ \ + .type = PMIX_UNDEF, \ + .size = 0, \ + .array = NULL \ +} + +/**** THE PMIX_DATA_ARRAY SUPPORT MACROS ARE DEFINED ****/ +/**** DOWN BELOW (NEAR THE BOTTOM OF THE FILE) TO ****/ +/**** AVOID CIRCULAR DEPENDENCIES ****/ + + +/* we cannot forward-declare the pmix_regattr_t struct + * as Cython doesn't know what to do with it. Thus, we + * will utilize the void* entry of the pmix_value_t to + * hold the pointer to pmix_regattr_t */ + +/**** PMIX DATA BUFFER ****/ +typedef struct pmix_data_buffer { + /** Start of my memory */ + char *base_ptr; + /** Where the next data will be packed to (within the allocated + memory starting at base_ptr) */ + char *pack_ptr; + /** Where the next data will be unpacked from (within the + allocated memory starting as base_ptr) */ + char *unpack_ptr; + /** Number of bytes allocated (starting at base_ptr) */ + size_t bytes_allocated; + /** Number of bytes used by the buffer (i.e., amount of data -- + including overhead -- packed in the buffer) */ + size_t bytes_used; +} pmix_data_buffer_t; + +#define PMIX_DATA_BUFFER_STATIC_INIT \ +{ \ + .base_ptr = NULL, \ + .pack_ptr = NULL, \ + .unpack_ptr = NULL, \ + .bytes_allocated = 0, \ + .bytes_used = 0 \ +} + +/**** STATISTICS STRUCTURES ****/ +typedef struct pmix_proc_stats { + /* process ident info */ + char *node; + pmix_proc_t proc; + pid_t pid; + char *cmd; + /* process stats */ + char state; + struct timeval time; + float percent_cpu; + int32_t priority; + uint16_t num_threads; + float pss; /* in MBytes */ + float vsize; /* in MBytes */ + float rss; /* in MBytes */ + float peak_vsize; /* in MBytes */ + uint16_t processor; + /* time at which sample was taken */ + struct timeval sample_time; +} pmix_proc_stats_t; + +#define PMIX_PROC_STATS_STATIC_INIT \ +{ \ + .node = NULL, \ + .proc = PMIX_PROC_STATIC_INIT, \ + .pid = 0, \ + .cmd = NULL, \ + .state = '\0', \ + .time = {0, 0}, \ + .percent_cpu = 0.0, \ + .priority = 0, \ + .num_threads = 0, \ + .pss = 0.0, 
\ + .vsize = 0.0, \ + .rss = 0.0, \ + .peak_vsize = 0.0, \ + .processor = 0, \ + .sample_time = {0, 0} \ +} + +#define PMIX_PROC_STATS_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_proc_stats_t*)pmix_malloc((n) * sizeof(pmix_proc_stats_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_proc_stats_t)); \ + } \ + } \ + } while (0) + +#define PMIX_PROC_STATS_RELEASE(m) \ + do { \ + PMIX_PROC_STATS_FREE((m), 1); \ + } while (0) + +#define PMIX_PROC_STATS_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_proc_stats_t)); \ + } while (0) + +#define PMIX_PROC_STATS_DESTRUCT(m) \ + do { \ + if (NULL != (m)->node) { \ + pmix_free((m)->node); \ + (m)->node = NULL; \ + } \ + if (NULL != (m)->cmd) { \ + pmix_free((m)->cmd); \ + (m)->cmd = NULL; \ + } \ + } while(0) + +static inline void pmix_proc_stats_free(pmix_proc_stats_t *ps, size_t n) +{ + size_t k; + + if (NULL != ps) { + for (k=0; k < n; k++) { + PMIX_PROC_STATS_DESTRUCT(&ps[k]); + } + } +} + +#define PMIX_PROC_STATS_FREE(m, n) \ +do { \ + pmix_proc_stats_free(m, n); \ + pmix_free(m); \ + (m) = NULL; \ +} while(0) + +typedef struct { + char *disk; + uint64_t num_reads_completed; + uint64_t num_reads_merged; + uint64_t num_sectors_read; + uint64_t milliseconds_reading; + uint64_t num_writes_completed; + uint64_t num_writes_merged; + uint64_t num_sectors_written; + uint64_t milliseconds_writing; + uint64_t num_ios_in_progress; + uint64_t milliseconds_io; + uint64_t weighted_milliseconds_io; +} pmix_disk_stats_t; + +#define PMIX_DISK_STATS_STATIC_INIT \ +{ \ + .disk = NULL, \ + .num_reads_completed = 0, \ + .num_reads_merged = 0, \ + .num_sectors_read = 0, \ + .milliseconds_reading = 0, \ + .num_writes_completed = 0, \ + .num_writes_merged = 0, \ + .num_sectors_written = 0, \ + .milliseconds_writing = 0, \ + .num_ios_in_progress = 0, \ + .milliseconds_io = 0, \ + .weighted_milliseconds_io = 0 \ +} + +#define PMIX_DISK_STATS_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ 
+ (m) = NULL; \ + } else { \ + (m) = (pmix_disk_stats_t*)pmix_malloc((n) * sizeof(pmix_disk_stats_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_disk_stats_t)); \ + } \ + } \ + } while (0) + +#define PMIX_DISK_STATS_RELEASE(m) \ + do { \ + PMIX_DISK_STATS_FREE((m), 1); \ + } while (0) + +#define PMIX_DISK_STATS_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_disk_stats_t)); \ + } while (0) + +#define PMIX_DISK_STATS_DESTRUCT(m) \ + do { \ + if (NULL != (m)->disk) { \ + pmix_free((m)->disk); \ + (m)->disk = NULL; \ + } \ + } while(0) + +static inline void pmix_disk_stats_free(pmix_disk_stats_t *d, size_t n) +{ + size_t k; + + if (NULL != d) { + for (k=0; k < n; k++) { + PMIX_DISK_STATS_DESTRUCT(&d[k]); + } + } +} + +#define PMIX_DISK_STATS_FREE(m, n) \ +do { \ + pmix_disk_stats_free(m, n); \ + pmix_free(m); \ + (m) = NULL; \ +} while(0) + +typedef struct { + char *net_interface; + uint64_t num_bytes_recvd; + uint64_t num_packets_recvd; + uint64_t num_recv_errs; + uint64_t num_bytes_sent; + uint64_t num_packets_sent; + uint64_t num_send_errs; +} pmix_net_stats_t; + +#define PMIX_NET_STATS_STATIC_INIT \ +{ \ + .net_interface = NULL, \ + .num_bytes_recvd = 0, \ + .num_packets_recvd = 0, \ + .num_recv_errs = 0, \ + .num_bytes_sent = 0, \ + .num_packets_sent = 0, \ + .num_send_errs = 0 \ +} + +#define PMIX_NET_STATS_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_net_stats_t*)pmix_malloc((n) * sizeof(pmix_net_stats_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_net_stats_t)); \ + } \ + } \ + } while (0) + +#define PMIX_NET_STATS_RELEASE(m) \ + do { \ + PMIX_NET_STATS_FREE((m), 1); \ + } while (0) + +#define PMIX_NET_STATS_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_net_stats_t)); \ + } while (0) + +#define PMIX_NET_STATS_DESTRUCT(m) \ + do { \ + if (NULL != (m)->net_interface) { \ + pmix_free((m)->net_interface); \ + (m)->net_interface = NULL; \ + } \ + } while(0) + +static inline 
void pmix_net_stats_free(pmix_net_stats_t *nst, size_t n) +{ + size_t k; + + if (NULL != nst) { + for (k=0; k < n; k++) { + PMIX_NET_STATS_DESTRUCT(&nst[k]); + } + } +} + +#define PMIX_NET_STATS_FREE(m, n) \ +do { \ + pmix_net_stats_free(m, n); \ + pmix_free(m); \ + (m) = NULL; \ +} while(0) + +typedef struct { + char *node; + /* node-level load averages */ + float la; + float la5; + float la15; + /* memory usage */ + float total_mem; /* in MBytes */ + float free_mem; /* in MBytes */ + float buffers; /* in MBytes */ + float cached; /* in MBytes */ + float swap_cached; /* in MBytes */ + float swap_total; /* in MBytes */ + float swap_free; /* in MBytes */ + float mapped; /* in MBytes */ + /* time at which sample was taken */ + struct timeval sample_time; + /* array of disk stats, one per disk */ + pmix_disk_stats_t *diskstats; + size_t ndiskstats; + /* array of net stats, one per interface */ + pmix_net_stats_t *netstats; + size_t nnetstats; +} pmix_node_stats_t; + +#define PMIX_NODE_STATS_STATIC_INIT \ +{ \ + .node = NULL, \ + .la = 0, \ + .la5 = 0, \ + .la15 = 0, \ + .total_mem = 0.0, \ + .free_mem = 0.0, \ + .buffers = 0.0, \ + .cached = 0.0, \ + .swap_cached = 0.0, \ + .swap_total = 0.0, \ + .swap_free = 0.0, \ + .mapped = 0.0, \ + .sample_time = {0, 0}, \ + .diskstats = NULL, \ + .ndiskstats = 0, \ + .netstats = NULL, \ + .nnetstats = 0 \ +} + +#define PMIX_NODE_STATS_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_node_stats_t*)pmix_malloc((n) * sizeof(pmix_node_stats_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_node_stats_t)); \ + } \ + } \ + } while (0) + +#define PMIX_NODE_STATS_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_node_stats_t)); \ + } while (0) + +#define PMIX_NODE_STATS_DESTRUCT(m) \ + do { \ + if (NULL != (m)->node) { \ + pmix_free((m)->node); \ + (m)->node = NULL; \ + } \ + if (NULL != (m)->diskstats) { \ + PMIX_DISK_STATS_FREE((m)->diskstats, (m)->ndiskstats); \ + } \ + if (NULL 
!= (m)->netstats) { \ + PMIX_NET_STATS_FREE((m)->netstats, (m)->nnetstats); \ + } \ + } while(0) + +static inline void pmix_node_stats_free(pmix_node_stats_t *nd, size_t n) +{ + size_t k; + + if (NULL != nd) { + for (k=0; k < n; k++) { + PMIX_NODE_STATS_DESTRUCT(&nd[k]); + } + } +} + +#define PMIX_NODE_STATS_FREE(m, n) \ +do { \ + pmix_node_stats_free(m, n); \ + pmix_free(m); \ + (m) = NULL; \ +} while(0) + +#define PMIX_NODE_STATS_RELEASE(m) \ + pmix_node_stats_free(m, 1) + + +/**** PMIX VALUE STRUCT ****/ + +/* NOTE: operations can supply a collection of values under + * a single key by passing a pmix_value_t containing a + * data array of type PMIX_INFO, with each array element + * containing its own pmix_info_t object */ + +typedef struct pmix_value { + pmix_data_type_t type; + union { + bool flag; + uint8_t byte; + char *string; + size_t size; + pid_t pid; + int integer; + int8_t int8; + int16_t int16; + int32_t int32; + int64_t int64; + unsigned int uint; + uint8_t uint8; + uint16_t uint16; + uint32_t uint32; + uint64_t uint64; + float fval; + double dval; + struct timeval tv; + time_t time; + pmix_status_t status; + pmix_rank_t rank; + pmix_nspace_t *nspace; + pmix_proc_t *proc; + pmix_byte_object_t bo; + pmix_persistence_t persist; + pmix_scope_t scope; + pmix_data_range_t range; + pmix_proc_state_t state; + pmix_proc_info_t *pinfo; + pmix_data_array_t *darray; + void *ptr; + pmix_alloc_directive_t adir; + pmix_envar_t envar; + pmix_coord_t *coord; + pmix_link_state_t linkstate; + pmix_job_state_t jstate; + pmix_topology_t *topo; + pmix_cpuset_t *cpuset; + pmix_locality_t locality; + pmix_geometry_t *geometry; + pmix_device_type_t devtype; + pmix_device_distance_t *devdist; + pmix_endpoint_t *endpoint; + pmix_data_buffer_t *dbuf; + pmix_proc_stats_t *pstats; + pmix_disk_stats_t *dkstats; + pmix_net_stats_t *netstats; + pmix_node_stats_t *ndstats; + } data; +} pmix_value_t; + +#define PMIX_VALUE_STATIC_INIT \ +{ \ + .type = PMIX_UNDEF, \ + .data.ptr = NULL \ 
+} + +/* allocate and initialize a specified number of value structs */ +#define PMIX_VALUE_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_value_t*)pmix_malloc((n) * sizeof(pmix_value_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n)*sizeof(pmix_value_t)); \ + } \ + } \ + } while (0) + +/* initialize a single value struct */ +#define PMIX_VALUE_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_value_t)); \ + (m)->type = PMIX_UNDEF; \ + } while (0) + +#define PMIX_VALUE_GET_NUMBER(s, m, n, t) \ + do { \ + (s) = PMIX_SUCCESS; \ + if (PMIX_SIZE == (m)->type) { \ + (n) = (t)((m)->data.size); \ + } else if (PMIX_INT == (m)->type) { \ + (n) = (t)((m)->data.integer); \ + } else if (PMIX_INT8 == (m)->type) { \ + (n) = (t)((m)->data.int8); \ + } else if (PMIX_INT16 == (m)->type) { \ + (n) = (t)((m)->data.int16); \ + } else if (PMIX_INT32 == (m)->type) { \ + (n) = (t)((m)->data.int32); \ + } else if (PMIX_INT64 == (m)->type) { \ + (n) = (t)((m)->data.int64); \ + } else if (PMIX_UINT == (m)->type) { \ + (n) = (t)((m)->data.uint); \ + } else if (PMIX_UINT8 == (m)->type) { \ + (n) = (t)((m)->data.uint8); \ + } else if (PMIX_UINT16 == (m)->type) { \ + (n) = (t)((m)->data.uint16); \ + } else if (PMIX_UINT32 == (m)->type) { \ + (n) = (t)((m)->data.uint32); \ + } else if (PMIX_UINT64 == (m)->type) { \ + (n) = (t)((m)->data.uint64); \ + } else if (PMIX_FLOAT == (m)->type) { \ + (n) = (t)((m)->data.fval); \ + } else if (PMIX_DOUBLE == (m)->type) { \ + (n) = (t)((m)->data.dval); \ + } else if (PMIX_PID == (m)->type) { \ + (n) = (t)((m)->data.pid); \ + } else if (PMIX_PROC_RANK == (m)->type) { \ + (n) = (t)((m)->data.rank); \ + } else { \ + (s) = PMIX_ERR_BAD_PARAM; \ + } \ + } while(0) + +/**** PMIX INFO STRUCT ****/ +typedef struct pmix_info { + pmix_key_t key; + pmix_info_directives_t flags; // bit-mask of flags + pmix_value_t value; +} pmix_info_t; + +#define PMIX_INFO_STATIC_INIT \ +{ \ + .key = {0}, \ + .flags = 0, \ + .value = 
PMIX_VALUE_STATIC_INIT \ +} + +/* utility macros for working with pmix_info_t structs */ +#define PMIX_INFO_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_info_t)); \ + (m)->value.type = PMIX_UNDEF; \ + } while (0) + +#define PMIX_INFO_CREATE(m, n) \ + do { \ + pmix_info_t *_i; \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_info_t*)pmix_malloc((n) * sizeof(pmix_info_t)); \ + if (NULL != (m)) { \ + _i = (pmix_info_t*)(m); \ + memset((m), 0, (n) * sizeof(pmix_info_t)); \ + _i[(n)-1].flags = PMIX_INFO_ARRAY_END; \ + } \ + } \ + } while (0) + +/* macros for setting and unsetting the "reqd" flag + * in a pmix_info_t */ +#define PMIX_INFO_REQUIRED(m) \ + ((m)->flags |= PMIX_INFO_REQD) +#define PMIX_INFO_OPTIONAL(m) \ + ((m)->flags &= ~PMIX_INFO_REQD) + +/* macros for testing the "reqd" flag in a pmix_info_t */ +#define PMIX_INFO_IS_REQUIRED(m) \ + ((m)->flags & PMIX_INFO_REQD) +#define PMIX_INFO_IS_OPTIONAL(m) \ + !((m)->flags & PMIX_INFO_REQD) + +/* macros for setting and testing the "reqd processed" flag */ +#define PMIX_INFO_PROCESSED(m) \ + ((m)->flags |= PMIX_INFO_REQD_PROCESSED) +#define PMIX_INFO_WAS_PROCESSED(m) \ + ((m)->flags & PMIX_INFO_REQD_PROCESSED) + +/* macro for testing end of the array */ +#define PMIX_INFO_SET_END(m) \ + ((m)->flags |= PMIX_INFO_ARRAY_END) +#define PMIX_INFO_IS_END(m) \ + ((m)->flags & PMIX_INFO_ARRAY_END) + +/* macro for testing if qualifier */ +#define PMIX_INFO_SET_QUALIFIER(i) \ + ((i)->flags |= PMIX_INFO_QUALIFIER) +#define PMIX_INFO_IS_QUALIFIER(i) \ + ((i)->flags & PMIX_INFO_QUALIFIER) + +/* macro for setting and testing the "donot release" flag */ +#define PMIX_INFO_SET_PERSISTENT(ii) \ + ((ii)->flags |= PMIX_INFO_PERSISTENT) +#define PMIX_INFO_IS_PERSISTENT(ii) \ + ((ii)->flags & PMIX_INFO_PERSISTENT) + +typedef enum { + PMIX_BOOL_TRUE, + PMIX_BOOL_FALSE, + PMIX_NON_BOOL +} pmix_boolean_t; + +/** + * Provide a check to see if a value is "true" or + * "false", whether given as a string or boolean + * 
input. + */ +static inline pmix_boolean_t pmix_check_true(const pmix_value_t *value) +{ + char *ptr; + + if (PMIX_UNDEF == value->type) { + return PMIX_BOOL_TRUE; // default to true + } + if (PMIX_BOOL == value->type) { + if (value->data.flag) { + return PMIX_BOOL_TRUE; + } else { + return PMIX_BOOL_FALSE; + } + } + if (PMIX_STRING == value->type) { + if (NULL == value->data.string) { + return PMIX_BOOL_TRUE; + } + ptr = value->data.string; + /* Trim leading whitespace */ + while (isspace(*ptr)) { + ++ptr; + } + if ('\0' == *ptr) { + return PMIX_BOOL_TRUE; + } + if (isdigit(*ptr)) { + if (0 == atoi(ptr)) { + return PMIX_BOOL_FALSE; + } else { + return PMIX_BOOL_TRUE; + } + } else if (0 == strncasecmp(ptr, "yes", 3) || + 0 == strncasecmp(ptr, "true", 4)) { + return PMIX_BOOL_TRUE; + } else if (0 == strncasecmp(ptr, "no", 2) || + 0 == strncasecmp(ptr, "false", 5)) { + return PMIX_BOOL_FALSE; + } + } + + return PMIX_NON_BOOL; +} + +/* provide a macro version of it for those preferring + * that syntax in their codes where they know the + * value being checked IS a boolean of some form + */ +#define PMIX_CHECK_TRUE(a) \ + (PMIX_BOOL_TRUE == pmix_check_true(a) ? true : false) + +#define PMIX_CHECK_BOOL(a) \ + (PMIX_NON_BOOL == pmix_check_true(a) ? false : true) + +/* define a special macro for checking if a boolean + * info is true - when info structs are provided, a + * type of PMIX_UNDEF is taken to imply a boolean "true" + * as the presence of the key defaults to indicating + * "true". Also supports passing of string representations + * such as "t" or "f" */ +#define PMIX_INFO_TRUE(m) \ + (PMIX_BOOL_TRUE == pmix_check_true(&(m)->value) ? 
true : false) + + +/**** PMIX LOOKUP RETURN STRUCT ****/ +typedef struct pmix_pdata { + pmix_proc_t proc; + pmix_key_t key; + pmix_value_t value; +} pmix_pdata_t; + +#define PMIX_LOOKUP_STATIC_INIT \ +{ \ + .proc = PMIX_PROC_STATIC_INIT, \ + .key = {0}, \ + .value = PMIX_VALUE_STATIC_INIT \ +} + +/* utility macros for working with pmix_pdata_t structs */ +#define PMIX_PDATA_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_pdata_t*)pmix_malloc((n) * sizeof(pmix_pdata_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_pdata_t)); \ + } \ + } \ + } while (0) + +#define PMIX_PDATA_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_pdata_t)); \ + (m)->value.type = PMIX_UNDEF; \ + } while (0) + + +/**** PMIX APP STRUCT ****/ +typedef struct pmix_app { + char *cmd; + char **argv; + char **env; + char *cwd; + int maxprocs; + pmix_info_t *info; + size_t ninfo; +} pmix_app_t; + +#define PMIX_APP_STATIC_INIT \ +{ \ + .cmd = NULL, \ + .argv = NULL, \ + .env = NULL, \ + .cwd = NULL, \ + .maxprocs = 0, \ + .info = NULL, \ + .ninfo = 0 \ +} + +/* utility macros for working with pmix_app_t structs */ +#define PMIX_APP_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_app_t*)pmix_malloc((n) * sizeof(pmix_app_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_app_t)); \ + } \ + } \ + } while (0) + +#define PMIX_APP_INFO_CREATE(m, n) \ + do { \ + (m)->ninfo = (n); \ + PMIX_INFO_CREATE((m)->info, (m)->ninfo); \ + } while(0) + +#define PMIX_APP_RELEASE(m) \ + do { \ + PMIX_APP_DESTRUCT((m)); \ + pmix_free((m)); \ + (m) = NULL; \ + } while (0) + +#define PMIX_APP_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_app_t)); \ + } while (0) + + +/**** PMIX QUERY STRUCT ****/ +typedef struct pmix_query { + char **keys; + pmix_info_t *qualifiers; + size_t nqual; +} pmix_query_t; + +#define PMIX_QUERY_STATIC_INIT \ +{ \ + .keys = NULL, \ + .qualifiers = NULL, \ + .nqual = 0 \ +} + +/* 
utility macros for working with pmix_query_t structs */ +#define PMIX_QUERY_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_query_t*)pmix_malloc((n) * sizeof(pmix_query_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_query_t)); \ + } \ + } \ + } while (0) + +#define PMIX_QUERY_QUALIFIERS_CREATE(m, n) \ + do { \ + (m)->nqual = (n); \ + PMIX_INFO_CREATE((m)->qualifiers, (m)->nqual); \ + } while(0) + +#define PMIX_QUERY_RELEASE(m) \ + do { \ + PMIX_QUERY_DESTRUCT((m)); \ + pmix_free((m)); \ + (m) = NULL; \ + } while (0) + +#define PMIX_QUERY_CONSTRUCT(m) \ + do { \ + memset((m), 0, sizeof(pmix_query_t)); \ + } while (0) + +#define PMIX_QUERY_DESTRUCT(m) \ + do { \ + size_t _qi; \ + if (NULL != (m)->keys) { \ + for (_qi=0; NULL != (m)->keys[_qi]; _qi++) { \ + pmix_free((m)->keys[_qi]); \ + } \ + pmix_free((m)->keys); \ + (m)->keys = NULL; \ + } \ + if (NULL != (m)->qualifiers) { \ + PMIX_INFO_FREE((m)->qualifiers, (m)->nqual); \ + (m)->qualifiers = NULL; \ + (m)->nqual = 0; \ + } \ + } while (0) + +#define PMIX_QUERY_FREE(m, n) \ + do { \ + size_t _qs; \ + if (NULL != (m)) { \ + for (_qs=0; _qs < (n); _qs++) { \ + PMIX_QUERY_DESTRUCT(&((m)[_qs])); \ + } \ + pmix_free((m)); \ + (m) = NULL; \ + } \ + } while (0) + +/**** ATTRIBUTE REGISTRATION STRUCT ****/ +typedef struct pmix_regattr_t { + char *name; + pmix_key_t string; + pmix_data_type_t type; + char **description; +} pmix_regattr_t; + +#define PMIX_REGATTR_STATIC_INIT \ +{ \ + .name = NULL, \ + .string = {0}, \ + .type = PMIX_UNDEF, \ + .description = NULL \ +} + +#define PMIX_REGATTR_CONSTRUCT(a) \ + do { \ + if (NULL != (a)) { \ + (a)->name = NULL; \ + memset((a)->string, 0, PMIX_MAX_KEYLEN+1); \ + (a)->type = PMIX_UNDEF; \ + (a)->description = NULL; \ + } \ + } while(0) + +#define PMIX_REGATTR_LOAD(a, n, k, t, v) \ + do { \ + pmix_status_t _rgl; \ + if (NULL != (n)) { \ + (a)->name = strdup((n)); \ + } \ + if (NULL != (k)) { \ + PMIX_LOAD_KEY((a)->string, 
(k)); \ + } \ + (a)->type = (t); \ + if (NULL != (v)) { \ + PMIX_ARGV_APPEND(_rgl, &(a)->description, (v)); \ + } \ + } while(0) + +#define PMIX_REGATTR_DESTRUCT(a) \ + do { \ + if (NULL != (a)) { \ + if (NULL != (a)->name) { \ + pmix_free((a)->name); \ + } \ + if (NULL != (a)->description) { \ + PMIX_ARGV_FREE((a)->description); \ + } \ + } \ + } while(0) + +#define PMIX_REGATTR_CREATE(m, n) \ + do { \ + if (0 == (n)) { \ + (m) = NULL; \ + } else { \ + (m) = (pmix_regattr_t*)pmix_malloc((n) * sizeof(pmix_regattr_t)); \ + if (NULL != (m)) { \ + memset((m), 0, (n) * sizeof(pmix_regattr_t)); \ + } \ + } \ + } while (0) + +#define PMIX_REGATTR_FREE(m, n) \ + do { \ + size_t _ra; \ + if (NULL != (m)) { \ + for (_ra=0; _ra < (n); _ra++) { \ + PMIX_REGATTR_DESTRUCT(&((m)[_ra])); \ + } \ + pmix_free((m)); \ + (m) = NULL; \ + } \ + } while (0) + +#define PMIX_REGATTR_XFER(a, b) \ + do { \ + size_t _n; \ + PMIX_REGATTR_CONSTRUCT((a)); \ + if (NULL != ((b)->name)) { \ + (a)->name = strdup((b)->name); \ + } \ + PMIX_LOAD_KEY((a)->string, (b)->string); \ + (a)->type = (b)->type; \ + if (NULL != (b)->description) { \ + PMIX_ARGV_COPY((a)->description, (b)->description); \ + } \ + } while(0) + + +/**** FABRIC STRUCT ****/ +/* Define a pmix_fabric_t struct for + * interacting with fabric-related interfaces */ +typedef struct pmix_fabric_s { + /* user-supplied name for this fabric */ + char *name; + /* a PMIx-supplied index identifying this registration object */ + size_t index; + /* array containing information (provided by the PMIx library) + * about the fabric */ + pmix_info_t *info; + size_t ninfo; + /* object pointer for use by the PMIx library */ + void *module; +} pmix_fabric_t; + +#define PMIX_FABRIC_STATIC_INIT \ +{ \ + .name = NULL, \ + .index = 0, \ + .info = NULL, \ + .ninfo = 0, \ + .module = NULL \ +} + +/* convenience macros to support pmix_fabric_t */ +#define PMIX_FABRIC_CONSTRUCT(x) \ + memset(x, 0, sizeof(pmix_fabric_t)) + +typedef enum { + 
PMIX_FABRIC_REQUEST_INFO, + PMIX_FABRIC_UPDATE_INFO +} pmix_fabric_operation_t; + +/**** CALLBACK FUNCTIONS FOR NON-BLOCKING OPERATIONS ****/ + +typedef void (*pmix_release_cbfunc_t)(void *cbdata); + +/* define a callback function that is solely used by servers, and + * not clients, to return modex data in response to "fence" and "get" + * operations. The returned blob contains the data collected from each + * server participating in the operation. + * + * As the data is "owned" by the host server, provide a secondary + * callback function to notify the host server that we are done + * with the data so it can be released */ +typedef void (*pmix_modex_cbfunc_t)(pmix_status_t status, + const char *data, size_t ndata, + void *cbdata, + pmix_release_cbfunc_t release_fn, + void *release_cbdata); + +/* define a callback function for calls to PMIx_Spawn_nb - the function + * will be called upon completion of the spawn command. The status + * will indicate whether or not the spawn succeeded. The nspace + * of the spawned processes will be returned, along with any provided + * callback data. Note that the returned nspace value will be + * released by the library upon return from the callback function, so + * the receiver must copy it if it needs to be retained */ +typedef void (*pmix_spawn_cbfunc_t)(pmix_status_t status, + pmix_nspace_t nspace, void *cbdata); + +/* define a callback for common operations that simply return + * a status. Examples include the non-blocking versions of + * Fence, Connect, and Disconnect */ +typedef void (*pmix_op_cbfunc_t)(pmix_status_t status, void *cbdata); + +/* define a callback function for calls to PMIx_Lookup_nb - the + * function will be called upon completion of the command with the + * status indicating the success of failure of the request. Any + * retrieved data will be returned in an array of pmix_pdata_t structs. + * The nspace/rank of the process that provided each data element is + * also returned. 
+ * + * Note that these structures will be released upon return from + * the callback function, so the receiver must copy/protect the + * data prior to returning if it needs to be retained */ + +typedef void (*pmix_lookup_cbfunc_t)(pmix_status_t status, + pmix_pdata_t data[], size_t ndata, + void *cbdata); + +/* define a callback by which an event handler can notify the PMIx library + * that it has completed its response to the notification. The handler + * is _required_ to execute this callback so the library can determine + * if additional handlers need to be called. The handler shall return + * PMIX_SUCCESS if no further action is required. The return status + * of each event handler and any returned pmix_info_t structures + * will be added to the array of pmix_info_t passed to any subsequent + * event handlers to help guide their operation. + * + * If non-NULL, the provided callback function will be called to allow + * the event handler to release the provided info array. + */ +typedef void (*pmix_event_notification_cbfunc_fn_t)(pmix_status_t status, + pmix_info_t *results, size_t nresults, + pmix_op_cbfunc_t cbfunc, void *thiscbdata, + void *notification_cbdata); + +/* define a callback function for the event handler. Upon receipt of an + * event notification, PMIx will execute the specified notification + * callback function, providing: + * + * evhdlr_registration_id - the returned registration number of + * the event handler being called + * status - the event that occurred + * source - the nspace and rank of the process that generated + * the event. If the source is the resource manager, + * then the nspace will be empty and the rank will + * be PMIX_RANK_UNDEF + * info - any additional info provided regarding the event. + * ninfo - the number of info objects in the info array + * results - any provided results from event handlers called + * prior to this one. 
+ * nresults - number of info objects in the results array + * cbfunc - the function to be called upon completion of the handler + * cbdata - pointer to be returned in the completion cbfunc + * + * Note that different resource managers may provide differing levels + * of support for event notification to application processes. Thus, the + * info array may be NULL or may contain detailed information of the event. + * It is the responsibility of the application to parse any provided info array + * for defined key-values if it so desires. + * + * Possible uses of the pmix_info_t object include: + * + * - for the RM to alert the process as to planned actions, such as + * to abort the session, in response to the reported event + * + * - provide a timeout for alternative action to occur, such as for + * the application to request an alternate response to the event + * + * For example, the RM might alert the application to the failure of + * a node that resulted in termination of several processes, and indicate + * that the overall session will be aborted unless the application + * requests an alternative behavior in the next 5 seconds. The application + * then has time to respond with a checkpoint request, or a request to + * recover from the failure by obtaining replacement nodes and restarting + * from some earlier checkpoint. + * + * Support for these options is left to the discretion of the host RM. Info + * keys are included in the common definions above, but also may be augmented + * on a per-RM basis. 
+ * + * On the server side, the notification function is used to inform the host + * server of a detected event in the PMIx subsystem and/or client + */ +typedef void (*pmix_notification_fn_t)(size_t evhdlr_registration_id, + pmix_status_t status, + const pmix_proc_t *source, + pmix_info_t info[], size_t ninfo, + pmix_info_t *results, size_t nresults, + pmix_event_notification_cbfunc_fn_t cbfunc, + void *cbdata); + +/* define a callback function for calls to register handlers, e.g., event + * notification and IOF requests + * + * status - PMIX_SUCCESS or an appropriate error constant + * + * refid - reference identifier assigned to the handler by PMIx, + * used to deregister the handler + * + * cbdata - object provided to the registration call + */ +typedef void (*pmix_hdlr_reg_cbfunc_t)(pmix_status_t status, + size_t refid, + void *cbdata); +/* retain the deprecated form */ +typedef void (*pmix_evhdlr_reg_cbfunc_t)(pmix_status_t status, + size_t refid, + void *cbdata); + +/* define a callback function for calls to PMIx_Get_nb. The status + * indicates if the requested data was found or not - a pointer to the + * pmix_value_t structure containing the found data is returned. The + * pointer will be NULL if the requested data was not found. */ +typedef void (*pmix_value_cbfunc_t)(pmix_status_t status, + pmix_value_t *kv, void *cbdata); + +/* define a callback function for calls to PMIx_Query. The status + * indicates if requested data was found or not - an array of + * pmix_info_t will contain the key/value pairs. */ +typedef void (*pmix_info_cbfunc_t)(pmix_status_t status, + pmix_info_t *info, size_t ninfo, + void *cbdata, + pmix_release_cbfunc_t release_fn, + void *release_cbdata); + +/* Define a callback function to return a requested security credential. 
+ * Returned values include: + * + * status - PMIX_SUCCESS if a credential could be assigned as requested, or + * else an appropriate error code indicating the problem + * + * credential - pointer to an allocated pmix_byte_object_t containing the + * credential (as a opaque blob) and its size. Ownership of + * the credential is transferred to the receiving function - thus, + * responsibility for releasing the memory lies outside the + * PMIx library. + * + * info - an array of pmix_info_t structures provided by the system to pass + * any additional information about the credential - e.g., the identity + * of the issuing agent. The info array is owned by the PMIx library + * and is not to be released or altered by the receiving party. Note that + * this array is not related to the pmix_info_t structures possibly + * provided in the call to PMIx_Get_credential. + * + * Information provided by the issuing agent can subsequently be used + * by the application for a variety of purposes. Examples include: + * - checking identified authorizations to determine what + * requests/operations are feasible as a means to steering + * workflows + * - compare the credential type to that of the local SMS for + * compatibility + * + * ninfo - number of elements in the info array + * + * cbdata - the caller's provided void* object + * + * NOTE: the credential is opaque and therefore understandable only by + * a service compatible with the issuer. + */ +typedef void (*pmix_credential_cbfunc_t)(pmix_status_t status, + pmix_byte_object_t *credential, + pmix_info_t info[], size_t ninfo, + void *cbdata); + + +/* Define a validation callback function to indicate if a provided + * credential is valid, and any corresponding information regarding + * authorizations and other security matters + * Returned values include: + * + * status - PMIX_SUCCESS if the provided credential is valid. An appropriate + * error code indicating the issue if the credential is rejected. 
+ * + * info - an array of pmix_info_t structures provided by the system to pass + * any additional information about the authentication - e.g., the + * effective userid and group id of the certificate holder, and any + * related authorizations. The info array is owned by the PMIx library + * and is not to be released or altered by the receiving party. Note that + * this array is not related to the pmix_info_t structures possibly + * provided in the call to PMIx_Validate_credential. + * + * The precise contents of the array will depend on the host SMS and + * its associated security system. At the minimum, it is expected (but + * not required) that the array will contain entries for the PMIX_USERID + * and PMIX_GROUPID of the client described in the credential. + * + * ninfo - number of elements in the info array + * + * cbdata - the caller's provided void* object + */ +typedef void (*pmix_validation_cbfunc_t)(pmix_status_t status, + pmix_info_t info[], size_t ninfo, + void *cbdata); + +/* Define a callback function to return device distance arrays + */ +typedef void (*pmix_device_dist_cbfunc_t)(pmix_status_t status, + pmix_device_distance_t *dist, + size_t ndist, + void *cbdata, + pmix_release_cbfunc_t release_fn, + void *release_cbdata); + + + +#define PMIX_DATA_ARRAY_INIT(m, t) \ + do { \ + (m)->array = NULL; \ + (m)->type = (t); \ + (m)->size = 0; \ + } while(0) + +#define PMIX_DATA_ARRAY_CONSTRUCT(m, n, t) \ + do { \ + (m)->type = (t); \ + (m)->size = (n); \ + if (0 < (n)) { \ + if (PMIX_INFO == (t)) { \ + PMIX_INFO_CREATE((m)->array, (n)); \ + \ + } else if (PMIX_PROC == (t)) { \ + PMIX_PROC_CREATE((m)->array, (n)); \ + \ + } else if (PMIX_PROC_INFO == (t)) { \ + PMIX_PROC_INFO_CREATE((m)->array, (n)); \ + \ + } else if (PMIX_ENVAR == (t)) { \ + PMIX_ENVAR_CREATE((m)->array, (n)); \ + \ + } else if (PMIX_VALUE == (t)) { \ + PMIX_VALUE_CREATE((m)->array, (n)); \ + \ + } else if (PMIX_PDATA == (t)) { \ + PMIX_PDATA_CREATE((m)->array, (n)); \ + \ + } else if 
(PMIX_QUERY == (t)) { \ + PMIX_QUERY_CREATE((m)->array, (n)); \ + \ + } else if (PMIX_APP == (t)) { \ + PMIX_APP_CREATE((m)->array, (n)); \ + \ + } else if (PMIX_BYTE_OBJECT == (t) || \ + PMIX_COMPRESSED_STRING == (t)) { \ + PMIX_BYTE_OBJECT_CREATE((m)->array, (n)); \ + \ + } else if (PMIX_ALLOC_DIRECTIVE == (t) || \ + PMIX_PROC_STATE == (t) || \ + PMIX_PERSIST == (t) || \ + PMIX_SCOPE == (t) || \ + PMIX_DATA_RANGE == (t) || \ + PMIX_BYTE == (t) || \ + PMIX_INT8 == (t) || \ + PMIX_UINT8 == (t) || \ + PMIX_POINTER == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(int8_t)); \ + \ + } else if (PMIX_STRING == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(char*)); \ + \ + } else if (PMIX_SIZE == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(size_t)); \ + \ + } else if (PMIX_PID == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(pid_t)); \ + \ + } else if (PMIX_INT == (t) || \ + PMIX_UINT == (t) || \ + PMIX_STATUS == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(int)); \ + \ + } else if (PMIX_IOF_CHANNEL == (t) || \ + PMIX_DATA_TYPE == (t) || \ + PMIX_INT16 == (t) || \ + PMIX_UINT16 == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(int16_t)); \ + \ + } else if (PMIX_PROC_RANK == (t) || \ + PMIX_INFO_DIRECTIVES == (t) || \ + PMIX_INT32 == (t) || \ + PMIX_UINT32 == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(int32_t)); \ + \ + } else if (PMIX_INT64 == (t) || \ + PMIX_UINT64 == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(int64_t)); \ + \ + } else if (PMIX_FLOAT == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(float)); \ + \ + } else if (PMIX_DOUBLE == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(double)); \ + \ + } else if (PMIX_TIMEVAL == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(struct timeval)); \ + \ + } else if (PMIX_TIME == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(time_t)); \ + \ + } else if (PMIX_REGATTR == (t)) { \ + PMIX_REGATTR_CREATE((m)->array, (n)); \ + \ + } else if (PMIX_BOOL == (t)) { \ + (m)->array = pmix_calloc((n), 
sizeof(bool)); \ + \ + } else if (PMIX_COORD == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(pmix_coord_t)); \ + \ + } else if (PMIX_LINK_STATE == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(pmix_link_state_t)); \ + \ + } else if (PMIX_ENDPOINT == (t)) { \ + PMIX_ENDPOINT_CREATE((m)->array, n); \ + \ + } else if (PMIX_PROC_NSPACE == (t)) { \ + (m)->array = pmix_calloc((n), sizeof(pmix_nspace_t)); \ + \ + } else if (PMIX_PROC_STATS == (t)) { \ + PMIX_PROC_STATS_CREATE((m)->array, n); \ + \ + } else if (PMIX_DISK_STATS == (t)) { \ + PMIX_DISK_STATS_CREATE((m)->array, n); \ + \ + } else if (PMIX_NET_STATS == (t)) { \ + PMIX_NET_STATS_CREATE((m)->array, n); \ + \ + } else if (PMIX_NODE_STATS == (t)) { \ + PMIX_NODE_STATS_CREATE((m)->array, n); \ + \ + } else if (PMIX_DEVICE_DIST == (t)) { \ + PMIX_DEVICE_DIST_CREATE((m)->array, n); \ + \ + } else if (PMIX_GEOMETRY == (t)) { \ + PMIX_GEOMETRY_CREATE((m)->array, n); \ + \ + } else if (PMIX_REGATTR == (t)) { \ + PMIX_REGATTR_CREATE((m)->array, n); \ + \ + } else if (PMIX_PROC_CPUSET == (t)) { \ + PMIX_CPUSET_CREATE((m)->array, n); \ + } else { \ + (m)->array = NULL; \ + (m)->size = 0; \ + } \ + } else { \ + (m)->array = NULL; \ + } \ + } while(0) +#define PMIX_DATA_ARRAY_CREATE(m, n, t) \ + do { \ + (m) = (pmix_data_array_t*)pmix_malloc(sizeof(pmix_data_array_t)); \ + if (NULL != (m)) { \ + memset((m), 0, sizeof(pmix_data_array_t)); \ + PMIX_DATA_ARRAY_CONSTRUCT((m), (n), (t)); \ + } \ + } while(0) + +#include + +/******** STANDARD MACROS FOR DARRAY AND VALUE SUPPORT ********/ + +/* release the memory in the value struct data field */ +#define PMIX_VALUE_DESTRUCT(m) PMIx_Value_destruct(m) + +/* release a single pmix_value_t struct, including its data */ +#define PMIX_VALUE_RELEASE(m) \ + do { \ + PMIX_VALUE_DESTRUCT((m)); \ + pmix_free((m)); \ + (m) = NULL; \ + } while (0) + +#define PMIX_VALUE_FREE(m, n) \ + do { \ + size_t _vv; \ + if (NULL != (m)) { \ + for (_vv=0; _vv < (n); _vv++) { \ + 
PMIX_VALUE_DESTRUCT(&((m)[_vv])); \ + } \ + pmix_free((m)); \ + (m) = NULL; \ + } \ + } while (0) + +#define PMIX_INFO_DESTRUCT(m) \ + do { \ + if (!PMIX_INFO_IS_PERSISTENT((m))) { \ + PMIX_VALUE_DESTRUCT(&(m)->value); \ + } \ + } while (0) + +#define PMIX_INFO_FREE(m, n) \ + do { \ + size_t _is; \ + if (NULL != (m)) { \ + for (_is=0; _is < (n); _is++) { \ + PMIX_INFO_DESTRUCT(&((m)[_is])); \ + } \ + pmix_free((m)); \ + (m) = NULL; \ + } \ + } while (0) + +#define PMIX_APP_DESTRUCT(m) \ + do { \ + if (NULL != (m)->cmd) { \ + pmix_free((m)->cmd); \ + (m)->cmd = NULL; \ + } \ + if (NULL != (m)->argv) { \ + pmix_argv_free((m)->argv); \ + (m)->argv = NULL; \ + } \ + if (NULL != (m)->env) { \ + pmix_argv_free((m)->env); \ + (m)->env = NULL; \ + } \ + if (NULL != (m)->cwd) { \ + pmix_free((m)->cwd); \ + (m)->cwd = NULL; \ + } \ + if (NULL != (m)->info) { \ + PMIX_INFO_FREE((m)->info, (m)->ninfo); \ + (m)->info = NULL; \ + (m)->ninfo = 0; \ + } \ + } while (0) + +static inline void pmix_app_free(pmix_app_t *ap, size_t n) +{ + size_t k; + + if (NULL != ap) { + for (k=0; k < n; k++) { + PMIX_APP_DESTRUCT(&ap[k]); + } + } +} + +#define PMIX_APP_FREE(m, n) \ + do { \ + pmix_app_free(m, n); \ + pmix_free(m); \ + (m) = NULL; \ + } while (0) + + +#define PMIX_DATA_ARRAY_DESTRUCT(m) PMIx_Data_array_destruct(m) + +#define PMIX_DATA_ARRAY_FREE(m) \ + do { \ + if (NULL != (m)) { \ + PMIX_DATA_ARRAY_DESTRUCT(m); \ + pmix_free((m)); \ + (m) = NULL; \ + } \ + } while(0) + +#define PMIX_PDATA_RELEASE(m) \ + do { \ + PMIX_VALUE_DESTRUCT(&(m)->value); \ + pmix_free((m)); \ + (m) = NULL; \ + } while (0) + +#define PMIX_PDATA_DESTRUCT(m) \ + do { \ + PMIX_VALUE_DESTRUCT(&(m)->value); \ + } while (0) + +static inline void pmix_pdata_free(pmix_pdata_t *pd, size_t n) +{ + size_t k; + + if (NULL != pd) { + for (k=0; k < n; k++) { + PMIX_PDATA_DESTRUCT(&pd[k]); + } + } +} + +#define PMIX_PDATA_FREE(m, n) \ +do { \ + pmix_pdata_free(m, n); \ + pmix_free(m); \ + (m) = NULL; \ +} while(0) + +#if 
defined(c_plusplus) || defined(__cplusplus) +} +#endif + +#endif diff --git a/deps/pmix/include/pmix_deprecated.h b/deps/pmix/include/pmix_deprecated.h new file mode 100644 index 000000000..554478ec7 --- /dev/null +++ b/deps/pmix/include/pmix_deprecated.h @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. + * Copyright (c) 2015 Artem Y. Polyakov . + * All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2021-2022 Nanook Consulting All rights reserved. + * $COPYRIGHT$ + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer listed + * in this license in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * The copyright holders provide no reassurances that the source code + * provided does not infringe any patent, copyright, or any other + * intellectual property rights of third parties. The copyright holders + * disclaim any liability to any recipient for claims brought against + * recipient by any third party for infringement of that parties + * intellectual property rights. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $HEADER$ + * + * PMIx provides a "function-shipping" approach to support for + * implementing the server-side of the protocol. This method allows + * resource managers to implement the server without being burdened + * with PMIx internal details. Accordingly, each PMIx API is mirrored + * here in a function call to be provided by the server. When a + * request is received from the client, the corresponding server function + * will be called with the information. + * + * Any functions not supported by the RM can be indicated by a NULL for + * the function pointer. Client calls to such functions will have a + * "not supported" error returned. 
+ */ + +#ifndef PMIx_DEPRECATED_H +#define PMIx_DEPRECATED_H + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/***** v4 DECPRECATIONS/REMOVALS *****/ +/* APIs */ +PMIX_EXPORT pmix_status_t PMIx_tool_connect_to_server(pmix_proc_t *proc, + pmix_info_t info[], size_t ninfo); + +/* DATATYPES */ +#define PMIX_BUFFER 26 + +/* CONSTANTS */ +#define PMIX_ERR_SILENT -2 +#define PMIX_ERR_DEBUGGER_RELEASE -3 +#define PMIX_ERR_PROC_ABORTED -7 +#define PMIX_ERR_PROC_REQUESTED_ABORT -8 +#define PMIX_ERR_PROC_ABORTING -9 +#define PMIX_ERR_SERVER_FAILED_REQUEST -10 +#define PMIX_EXISTS -11 +#define PMIX_ERR_HANDSHAKE_FAILED -13 +#define PMIX_ERR_READY_FOR_HANDSHAKE -14 +#define PMIX_ERR_PROC_ENTRY_NOT_FOUND -17 +#define PMIX_ERR_PACK_MISMATCH -22 +#define PMIX_ERR_IN_ERRNO -26 +#define PMIX_ERR_DATA_VALUE_NOT_FOUND -30 +#define PMIX_ERR_INVALID_ARG -33 +#define PMIX_ERR_INVALID_KEY -34 +#define PMIX_ERR_INVALID_KEY_LENGTH -35 +#define PMIX_ERR_INVALID_VAL -36 +#define PMIX_ERR_INVALID_VAL_LENGTH -37 +#define PMIX_ERR_INVALID_LENGTH -38 +#define PMIX_ERR_INVALID_NUM_ARGS -39 +#define PMIX_ERR_INVALID_ARGS -40 +#define PMIX_ERR_INVALID_NUM_PARSED -41 +#define PMIX_ERR_INVALID_KEYVALP -42 +#define PMIX_ERR_INVALID_SIZE -43 +#define PMIX_ERR_INVALID_NAMESPACE -44 +#define PMIX_ERR_SERVER_NOT_AVAIL -45 +#define PMIX_ERR_NOT_IMPLEMENTED -48 +#define PMIX_DEBUG_WAITING_FOR_NOTIFY -58 +#define PMIX_ERR_FATAL -63 +#define PMIX_ERR_NOT_AVAILABLE -64 +#define PMIX_ERR_VALUE_OUT_OF_BOUNDS -65 +#define PMIX_ERR_FILE_OPEN_FAILURE -67 +#define PMIX_ERR_FILE_READ_FAILURE -68 +#define PMIX_ERR_FILE_WRITE_FAILURE -69 +#define PMIX_ERR_SYS_LIMITS_PIPES -70 +#define PMIX_ERR_SYS_LIMITS_CHILDREN -71 +#define PMIX_ERR_PIPE_SETUP_FAILURE -72 +#define PMIX_ERR_EXE_NOT_ACCESSIBLE -73 +#define PMIX_ERR_JOB_WDIR_NOT_ACCESSIBLE -74 +#define PMIX_ERR_SYS_LIMITS_FILES -75 +#define PMIX_ERR_LOST_CONNECTION_TO_SERVER -101 +#define PMIX_ERR_LOST_PEER_CONNECTION -102 +#define 
PMIX_ERR_LOST_CONNECTION_TO_CLIENT -103 +#define PMIX_NOTIFY_ALLOC_COMPLETE -105 +#define PMIX_ERR_INVALID_TERMINATION -112 +#define PMIX_ERR_JOB_TERMINATED -145 // DEPRECATED NAME - non-error termination +#define PMIX_ERR_UPDATE_ENDPOINTS -146 +#define PMIX_GDS_ACTION_COMPLETE -148 +#define PMIX_PROC_HAS_CONNECTED -149 +#define PMIX_CONNECT_REQUESTED -150 +#define PMIX_ERR_NODE_DOWN -231 +#define PMIX_ERR_NODE_OFFLINE -232 + +#define PMIX_ERR_SYS_BASE PMIX_EVENT_SYS_BASE +#define PMIX_ERR_SYS_OTHER PMIX_EVENT_SYS_OTHER + +#define PMIX_JOB_STATE_PREPPED 1 // job is ready to be launched + +/* ATTRIBUTES */ +#define PMIX_EVENT_BASE "pmix.evbase" // (struct event_base *) pointer to libevent event_base + // to use in place of the internal progress thread ***** DEPRECATED ***** +#define PMIX_TOPOLOGY "pmix.topo" // (hwloc_topology_t) ***** DEPRECATED ***** pointer to the PMIx client's internal + // topology object +#define PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) ***** DEPRECATED ***** nspace of the job assigned to this debugger to be debugged. 
Note + // that id's, pids, and other info on the procs is available + // via a query for the nspace's local or global proctable +#define PMIX_RECONNECT_SERVER "pmix.cnct.recon" // (bool) tool is requesting to change server connections + +/* attributes for the USOCK rendezvous socket */ +#define PMIX_USOCK_DISABLE "pmix.usock.disable" // (bool) disable legacy usock support +#define PMIX_SOCKET_MODE "pmix.sockmode" // (uint32_t) POSIX mode_t (9 bits valid) +#define PMIX_SINGLE_LISTENER "pmix.sing.listnr" // (bool) use only one rendezvous socket, letting priorities and/or + // MCA param select the active transport +#define PMIX_ALLOC_NETWORK "pmix.alloc.net" // (pmix_data_array_t*) ***** DEPRECATED ***** +#define PMIX_ALLOC_NETWORK_ID "pmix.alloc.netid" // (char*) ***** DEPRECATED ***** +#define PMIX_ALLOC_NETWORK_QOS "pmix.alloc.netqos" // (char*) ***** DEPRECATED ***** +#define PMIX_ALLOC_NETWORK_TYPE "pmix.alloc.nettype" // (char*) ***** DEPRECATED ***** +#define PMIX_ALLOC_NETWORK_PLANE "pmix.alloc.netplane" // (char*) ***** DEPRECATED ***** +#define PMIX_ALLOC_NETWORK_ENDPTS "pmix.alloc.endpts" // (size_t) ***** DEPRECATED ***** +#define PMIX_ALLOC_NETWORK_ENDPTS_NODE "pmix.alloc.endpts.nd" // (size_t) ***** DEPRECATED ***** +#define PMIX_ALLOC_NETWORK_SEC_KEY "pmix.alloc.nsec" // (pmix_byte_object_t) ***** DEPRECATED ***** +#define PMIX_PROC_DATA "pmix.pdata" // (pmix_data_array_t*) ***** DEPRECATED ***** starts with rank, then contains more data +#define PMIX_LOCALITY "pmix.loc" // (uint16_t) ***** DEPRECATED *****relative locality of two procs + +#define PMIX_LOCAL_TOPO "pmix.ltopo" // (char*) ***** DEPRECATED *****xml-representation of local node topology +#define PMIX_TOPOLOGY_XML "pmix.topo.xml" // (char*) ***** DEPRECATED *****XML-based description of topology +#define PMIX_TOPOLOGY_FILE "pmix.topo.file" // (char*) ***** DEPRECATED *****full path to file containing XML topology description +#define PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) ***** 
DEPRECATED *****topology signature string +#define PMIX_HWLOC_SHMEM_ADDR "pmix.hwlocaddr" // (size_t) ***** DEPRECATED *****address of HWLOC shared memory segment +#define PMIX_HWLOC_SHMEM_SIZE "pmix.hwlocsize" // (size_t) ***** DEPRECATED *****size of HWLOC shared memory segment +#define PMIX_HWLOC_SHMEM_FILE "pmix.hwlocfile" // (char*) ***** DEPRECATED *****path to HWLOC shared memory file +#define PMIX_HWLOC_XML_V1 "pmix.hwlocxml1" // (char*) ***** DEPRECATED ***** XML representation of local topology using HWLOC v1.x format +#define PMIX_HWLOC_XML_V2 "pmix.hwlocxml2" // (char*) ***** DEPRECATED ***** XML representation of local topology using HWLOC v2.x format +#define PMIX_HWLOC_SHARE_TOPO "pmix.hwlocsh" // (bool) ***** DEPRECATED ***** Share the HWLOC topology via shared memory +#define PMIX_HWLOC_HOLE_KIND "pmix.hwlocholek" // (char*) ***** DEPRECATED ***** Kind of VM "hole" HWLOC should use for shared memory +#define PMIX_DSTPATH "pmix.dstpath" // (char*) ***** DEPRECATED ***** path to dstore files +#define PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) ***** DEPRECATED ***** comma-delimited list of algorithms to use for collective +#define PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) ***** DEPRECATED ***** if true, indicates that the requested choice of algo is mandatory +#define PMIX_PROC_BLOB "pmix.pblob" // (pmix_byte_object_t) ***** DEPRECATED ***** packed blob of process data +#define PMIX_MAP_BLOB "pmix.mblob" // (pmix_byte_object_t) ***** DEPRECATED ***** packed blob of process location +#define PMIX_MAPPER "pmix.mapper" // (char*) ***** DEPRECATED ***** mapper to use for placing spawned procs +#define PMIX_NON_PMI "pmix.nonpmi" // (bool) ***** DEPRECATED ***** spawned procs will not call PMIx_Init +#define PMIX_PROC_URI "pmix.puri" // (char*) ***** DEPRECATED ***** URI containing contact info for proc +#define PMIX_ARCH "pmix.arch" // (uint32_t) ***** DEPRECATED ***** datatype architecture flag + +#define PMIX_DEBUG_JOB_DIRECTIVES 
"pmix.dbg.jdirs" // (pmix_data_array_t*) ***** DEPRECATED ***** array of job-level directives +#define PMIX_DEBUG_APP_DIRECTIVES "pmix.dbg.adirs" // (pmix_data_array_t*) ***** DEPRECATED ***** array of app-level directives +#define PMIX_EVENT_NO_TERMINATION "pmix.evnoterm" // (bool) ***** DEPRECATED ***** indicates that the handler has satisfactorily handled + // the event and believes termination of the application is not required +#define PMIX_EVENT_WANT_TERMINATION "pmix.evterm" // (bool) ***** DEPRECATED ***** indicates that the handler has determined that the + // application should be terminated +#define PMIX_TAG_OUTPUT "pmix.tagout" // (bool) ***** DEPRECATED ***** tag application output with the ID of the source +#define PMIX_TIMESTAMP_OUTPUT "pmix.tsout" // (bool) ***** DEPRECATED ***** timestamp output from applications +#define PMIX_MERGE_STDERR_STDOUT "pmix.mergeerrout" // (bool) ***** DEPRECATED ***** merge stdout and stderr streams from application procs +#define PMIX_OUTPUT_TO_FILE "pmix.outfile" // (char*) ***** DEPRECATED ***** direct application output into files of form + // ".rank" with both stdout and stderr redirected into it +#define PMIX_OUTPUT_TO_DIRECTORY "pmix.outdir" // (char*) ***** DEPRECATED ***** direct application output into files of form + // "//rank./stdout[err]" +#define PMIX_OUTPUT_NOCOPY "pmix.nocopy" // (bool) ***** DEPRECATED ***** output only into designated files - do not also output + // a copy to stdout/stderr + +/* attributes for GDS */ +#define PMIX_GDS_MODULE "pmix.gds.mod" // (char*) ***** DEPRECATED ***** comma-delimited string of desired modules +#define PMIX_BFROPS_MODULE "pmix.bfrops.mod" // (char*) ***** INTERNAL ***** name of bfrops plugin in-use by a given nspace +#define PMIX_PNET_SETUP_APP "pmix.pnet.setapp" // (pmix_byte_object_t) ***** INTERNAL ***** blob containing info to be given to pnet framework on remote nodes + +#define PMIX_IOF_STOP "pmix.iof.stop" // (bool) ***** DEPRECATED ***** Stop forwarding 
the specified channel(s) +#define PMIX_NOTIFY_LAUNCH "pmix.note.lnch" // (bool) ***** DEPRECATED ***** notify the requestor upon launch of the child job and return + // its namespace in the event + + +/* Bring some function definitions across from pmix.h for now-deprecated + * macros that utilize them. We have to do this as there are people who + * only included pmix_common.h if they were using macros but not APIs */ + +PMIX_EXPORT pmix_status_t PMIx_Value_load(pmix_value_t *val, + const void *data, + pmix_data_type_t type); +PMIX_EXPORT pmix_status_t PMIx_Value_unload(pmix_value_t *val, + void **data, + size_t *sz); +PMIX_EXPORT void PMIx_Value_destruct(pmix_value_t *val); +PMIX_EXPORT pmix_status_t PMIx_Value_xfer(pmix_value_t *dest, + const pmix_value_t *src); +PMIX_EXPORT pmix_value_cmp_t PMIx_Value_compare(pmix_value_t *v1, + pmix_value_t *v2); +PMIX_EXPORT void PMIx_Data_array_destruct(pmix_data_array_t *d); + +PMIX_EXPORT pmix_status_t PMIx_Info_load(pmix_info_t *info, + const char *key, + const void *data, + pmix_data_type_t type); + +PMIX_EXPORT pmix_status_t PMIx_Info_xfer(pmix_info_t *dest, + const pmix_info_t *src); + +PMIX_EXPORT void* PMIx_Info_list_start(void); + +PMIX_EXPORT pmix_status_t PMIx_Info_list_add(void *ptr, + const char *key, + const void *value, + pmix_data_type_t type); + +PMIX_EXPORT pmix_status_t PMIx_Info_list_insert(void *ptr, pmix_info_t *info); + +PMIX_EXPORT pmix_status_t PMIx_Info_list_xfer(void *ptr, + const pmix_info_t *info); + +PMIX_EXPORT pmix_status_t PMIx_Info_list_convert(void *ptr, pmix_data_array_t *par); + +PMIX_EXPORT void PMIx_Info_list_release(void *ptr); + +#define PMIX_VALUE_LOAD(v, d, t) \ + PMIx_Value_load((v), (d), (t)) + +#define PMIX_VALUE_UNLOAD(r, k, d, s) \ + (r) = PMIx_Value_unload((k), (d), (s)) + +#define PMIX_VALUE_XFER(r, v, s) \ + do { \ + if (NULL == (v)) { \ + (v) = (pmix_value_t*)pmix_malloc(sizeof(pmix_value_t)); \ + if (NULL == (v)) { \ + (r) = PMIX_ERR_NOMEM; \ + } else { \ + (r) = 
PMIx_Value_xfer((v), (s)); \ + } \ + } else { \ + (r) = PMIx_Value_xfer((v), (s)); \ + } \ + } while(0) + +#define PMIX_VALUE_XFER_DIRECT(r, v, s) \ + (r) = PMIx_Value_xfer((v), (s)) + +#define PMIX_INFO_LOAD(i, k, d, t) \ + (void) PMIx_Info_load(i, k, d, t) + +#define PMIX_INFO_XFER(d, s) \ + (void) PMIx_Info_xfer(d, s) + +#define PMIX_PDATA_LOAD(m, p, k, v, t) \ + do { \ + if (NULL != (m)) { \ + memset((m), 0, sizeof(pmix_pdata_t)); \ + PMIX_LOAD_NSPACE((m)->proc.nspace, (p)->nspace); \ + (m)->proc.rank = (p)->rank; \ + PMIX_LOAD_KEY((m)->key, k); \ + PMIx_Value_load(&((m)->value), (v), (t)); \ + } \ + } while (0) + +#define PMIX_PDATA_XFER(d, s) \ + do { \ + if (NULL != (d)) { \ + memset((d), 0, sizeof(pmix_pdata_t)); \ + PMIX_LOAD_NSPACE((d)->proc.nspace, (s)->proc.nspace); \ + (d)->proc.rank = (s)->proc.rank; \ + PMIX_LOAD_KEY((d)->key, (s)->key); \ + PMIx_Value_xfer(&((d)->value), &((s)->value)); \ + } \ + } while (0) + +#define PMIX_INFO_LIST_START(p) \ + (p) = PMIx_Info_list_start() + +#define PMIX_INFO_LIST_ADD(r, p, a, v, t) \ + (r) = PMIx_Info_list_add((p), (a), (v), (t)) + +#define PMIX_INFO_LIST_INSERT(r, p, i) \ + (r) = PMIx_Info_list_insert((p), (i)) + +#define PMIX_INFO_LIST_XFER(r, p, a) \ + (r) = PMIx_Info_list_xfer((p), (a)) + +#define PMIX_INFO_LIST_CONVERT(r, p, m) \ + (r) = PMIx_Info_list_convert((p), (m)) + +#define PMIX_INFO_LIST_RELEASE(p) \ + PMIx_Info_list_release((p)) + +#define PMIX_TOPOLOGY_DESTRUCT(x) \ + PMIx_Topology_destruct(x) + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif + +#endif diff --git a/deps/pmix/include/pmix_version.h b/deps/pmix/include/pmix_version.h new file mode 100644 index 000000000..616ed164c --- /dev/null +++ b/deps/pmix/include/pmix_version.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2016 Mellanox Technologies, Inc. + * All rights reserved. + * Copyright (c) 2018 IBM Corporation. All rights reserved. + * Copyright (c) 2018 Intel, Inc. All rights reserved. 
+ * Copyright (c) 2019 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#ifndef PMIx_VERSION_H +#define PMIx_VERSION_H + +/* define PMIx version */ +#define PMIX_VERSION_MAJOR 4L +#define PMIX_VERSION_MINOR 2L +#define PMIX_VERSION_RELEASE 1L + +#define PMIX_NUMERIC_VERSION 0x00040201 +#endif diff --git a/deps/pmix/update_pmix.sh b/deps/pmix/update_pmix.sh new file mode 100755 index 000000000..dfae04512 --- /dev/null +++ b/deps/pmix/update_pmix.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# +# Copyright 2016-2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +REPO=https://github.com/openpmix/openpmix.git + +install_path_root=`pwd` +install_path=`pwd`/_install + +git clone $REPO +cd ./openpmix +git submodule update --init --recursive +./autogen.pl +./configure --prefix=$install_path + +cp $install_path_root/openpmix/include/pmix.h $install_path_root/include +cp $install_path_root/openpmix/include/pmix_common.h $install_path_root/include +cp $install_path_root/openpmix/include/pmix_deprecated.h $install_path_root/include +cp $install_path_root/openpmix/include/pmix_version.h $install_path_root/include + +rm -rf $install_path_root/openpmix diff --git a/doc/Dockerfile b/doc/Dockerfile new file mode 100644 index 000000000..6768976cd --- /dev/null +++ b/doc/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3 + +WORKDIR /app + +COPY requirements.txt requirements.txt +RUN pip3 install -r requirements.txt + +COPY build_doc.sh Doxyfile copyright cclEULA.txt /app/ + +RUN apt-get update && apt-get install -y doxygen + +CMD [ "./build_doc.sh" ] \ No newline at end of file diff --git a/doc/build_doc.md b/doc/build_doc.md index 7f89fe3c3..1a5a16113 100644 --- a/doc/build_doc.md +++ b/doc/build_doc.md @@ -2,23 +2,26 @@ ## Description ## -The documentation is written using the restructured text markup language (also referred to as reST) and can be built with Doxygen and Sphinx. +The documentation is written using the restructured text markup language (also referred to as reST) and can be built with Doxygen and Sphinx. -## Software Requirements ## +## How to generate documentation ## -* Doxygen 1.8.16 -* Python 3.7.1 (may or may not work with older Python*, untested) +Install docker if absent and invoke -Once you have the software requirements set up, go to the `doc` directory and run the commmands described in the next section. +```bash +doc/build_doc_by_docker.sh +``` + +Generated documentation can be found in: `doc/rst/build/html` directory. ## Configure Doxygen ## -The Doxygen configuration lives in the `docs/Doxyfile` file. 
+The Doxygen configuration lives in the `doc/Doxyfile` file. Please refer to the [Doxygen configuration reference](http://www.doxygen.nl/manual/config.html) for more information. ## Configure Sphinx ## -You can create and modify Sphinx settings in the `docs/rst/source/conf.py` file. +You can create and modify Sphinx settings in the `doc/rst/source/conf.py` file. For more details, please refer to the [Sphinx configuration reference](https://www.sphinx-doc.org/en/master/usage/configuration.html). diff --git a/doc/build_doc_by_docker.sh b/doc/build_doc_by_docker.sh new file mode 100755 index 000000000..43db17094 --- /dev/null +++ b/doc/build_doc_by_docker.sh @@ -0,0 +1,20 @@ +#!/bin/sh -e +# +# Copyright 2016-2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +cd $( dirname -- "$0"; ) +docker build -t ccldocgen . 
+docker run -v $(pwd)/rst:/app/rst -v $(pwd)/../include:/include ccldocgen diff --git a/doc/requirements.txt b/doc/requirements.txt index 5c84e63d8..b88b9b932 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,61 +1,33 @@ alabaster==0.7.12 -astroid==2.2.5 -atomicwrites==1.3.0 -attrs==18.2.0 -Babel==2.7.0 -beautifulsoup4==4.8.0 -breathe==4.13.1 -bs4==0.0.1 -certifi==2019.6.16 -chardet==3.0.4 -colorama==0.4.1 -docutils==0.15.2 -entrypoints==0.3 -exhale==0.2.3 -filelock==3.0.12 -flake8==3.7.8 -idna==2.8 -imagesize==1.1.0 -importlib-metadata==0.19 -isort==4.3.21 -Jinja2==2.11.3 -lazy-object-proxy==1.4.1 -lxml==4.6.3 -MarkupSafe==1.1.1 -mccabe==0.6.1 -more-itertools==7.2.0 -packaging==19.1 -pdflatex==0.1.3 -pluggy==0.12.0 -py==1.8.0 -pycodestyle==2.5.0 -pyflakes==2.1.1 -Pygments==2.7.4 -pylint==2.3.1 -pyparsing==2.4.2 -pytest==5.0.1 -pytest-html==1.22.0 -pytest-metadata==1.8.0 -pytz==2019.2 -requests==2.22.0 -six==1.12.0 -snowballstemmer==1.9.0 -soupsieve==1.9.2 -Sphinx==3.5.4 -sphinx-book-theme==0.1.0 -sphinx-rtd-theme==0.4.3 -sphinxcontrib-applehelp==1.0.1 -sphinxcontrib-devhelp==1.0.1 -sphinxcontrib-htmlhelp==1.0.2 +Babel==2.10.3 +beautifulsoup4==4.11.1 +breathe==4.34.0 +certifi==2022.9.24 +charset-normalizer==2.1.1 +docutils==0.17.1 +idna==3.4 +imagesize==1.4.1 +importlib-metadata==5.0.0 +Jinja2==3.1.2 +MarkupSafe==2.1.1 +packaging==21.3 +pydata-sphinx-theme==0.8.1 +Pygments==2.13.0 +pyparsing==3.0.9 +pytz==2022.5 +PyYAML==6.0 +requests==2.28.1 +snowballstemmer==2.2.0 +soupsieve==2.3.2.post1 +Sphinx==4.5.0 +sphinx-book-theme==0.3.3 +sphinx-prompt==1.5.0 +sphinx-tabs==3.3.1 +sphinxcontrib-applehelp==1.0.2 +sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-htmlhelp==2.0.0 sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.2 -sphinxcontrib-serializinghtml==1.1.3 -toml==0.10.0 -tox==3.13.2 -typed-ast==1.4.0 -urllib3==1.25.3 -virtualenv==16.7.2 -wcwidth==0.1.7 -wrapt==1.11.2 -zipp==0.5.2 -sphinx-prompt==1.1.0 +sphinxcontrib-qthelp==1.0.3 
+sphinxcontrib-serializinghtml==1.1.5 +urllib3==1.26.12 +zipp==3.10.0 diff --git a/doc/rst/source/advanced-configuration/dmabuf.rst b/doc/rst/source/advanced-configuration/dmabuf.rst index 56dd979bd..b944be120 100644 --- a/doc/rst/source/advanced-configuration/dmabuf.rst +++ b/doc/rst/source/advanced-configuration/dmabuf.rst @@ -47,7 +47,7 @@ OFI :: - cmake -DCMAKE_INSTALL_PREFIX= -DLIBFABRIC_DIR= -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=dpcpp -DCOMPUTE_BACKEND=dpcpp -DENABLE_OFI_HMEM=1 .. + cmake -DCMAKE_INSTALL_PREFIX= -DLIBFABRIC_DIR= -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp -DENABLE_OFI_HMEM=1 .. make -j install diff --git a/doc/rst/source/conf.py b/doc/rst/source/conf.py index 909696a91..f04a1c83c 100755 --- a/doc/rst/source/conf.py +++ b/doc/rst/source/conf.py @@ -29,6 +29,7 @@ .. |product_short| replace:: oneCCL .. |mpi| replace:: Intel\ |reg|\ MPI Library .. |reg| unicode:: U+000AE +.. |tm| unicode:: U+2122 .. |copy| unicode:: U+000A9 .. |base_tk| replace:: Intel\ |reg|\ oneAPI Base Toolkit .. |c_api| replace:: C API diff --git a/doc/rst/source/env-variables.rst b/doc/rst/source/env-variables.rst index cb9664ee8..e90ad7f42 100644 --- a/doc/rst/source/env-variables.rst +++ b/doc/rst/source/env-variables.rst @@ -97,6 +97,10 @@ Available algorithms for each collective operation (````): - Recursive doubling algorithm * - ``2d`` - Two-dimensional algorithm (reduce_scatter + allreduce + allgather) + * - ``topo`` + - Optimized algorithm for GPU data and all-to-all network topology. + Use ``CCL_REDUCE_SCATTER_MONOLITHIC_KERNEL=1`` to use compute kernels, + instead of copy engines, to move the data across the GPU. ``ALLTOALL`` algorithms @@ -170,6 +174,10 @@ Available algorithms for each collective operation (````): - Tree algorithm * - ``double_tree`` - Double-tree algorithm + * - ``topo`` + - Optimized algorithm for GPU data and all-to-all network topology. 
+ Use ``CCL_REDUCE_SCATTER_MONOLITHIC_KERNEL=1`` to use compute kernels, + instead of copy engines, to move the data across the GPU. ``REDUCE_SCATTER`` algorithms @@ -599,9 +607,9 @@ CCL_ITT_LEVEL **Description** -Set this environment variable to specify ``Intel(R) Instrumentation and Tracing Technology`` (ITT) profiling level. +Set this environment variable to specify Intel\ |reg|\ Instrumentation and Tracing Technology (ITT) profiling level. Once the environment variable is enabled (value > 0), it is possible to collect and display profiling -data for |product_short| using tools such as ``Intel(R) VTune(TM) Amplifier``. +data for |product_short| using tools such as Intel\ |reg|\ VTune\ |tm|\ Profiler. Fusion diff --git a/doc/rst/source/general-configuration/transport-selection.rst b/doc/rst/source/general-configuration/transport-selection.rst index 99b19b8b8..b22798fc3 100644 --- a/doc/rst/source/general-configuration/transport-selection.rst +++ b/doc/rst/source/general-configuration/transport-selection.rst @@ -8,4 +8,4 @@ Transport Selection The transport selection is controlled by :ref:`CCL_ATL_TRANSPORT`. -In case of MPI over libfabric implementation (for example, ``Intel(R) MPI Library 2021``) or in case of direct libfabric transport, the selection of specific libfabric provider is controlled by the ``FI_PROVIDER`` environment variable. +In case of MPI over libfabric implementation (for example, |mpi| 2021) or in case of direct libfabric transport, the selection of specific libfabric provider is controlled by the ``FI_PROVIDER`` environment variable. diff --git a/doc/rst/source/index.rst b/doc/rst/source/index.rst index aa31c5a11..828df326c 100644 --- a/doc/rst/source/index.rst +++ b/doc/rst/source/index.rst @@ -13,7 +13,7 @@ - Works across various interconnects: InfiniBand*, Cornelis Networks*, and Ethernet. - Provides common API sufficient to support communication workflows within Deep Learning / distributed frameworks (such as `PyTorch* `_, `Horovod* `_). 
-|product_short| package comprises the |product_short| Software Development Kit (SDK) and the Intel(R) MPI Library Runtime components. +|product_short| package comprises the |product_short| Software Development Kit (SDK) and the |mpi| Runtime components. .. toctree:: diff --git a/doc/rst/source/introduction/installation.rst b/doc/rst/source/introduction/installation.rst index f779f5321..584090807 100644 --- a/doc/rst/source/introduction/installation.rst +++ b/doc/rst/source/introduction/installation.rst @@ -74,11 +74,11 @@ You can customize CLI-based installation (for example, specify directory, compil .. _enable_sycl: -* To enable ``SYCL`` devices communication support, specify ``SYCL`` compiler and set ``-DCOMPUTE_BACKEND`` (only DPC++ is supported): +* To enable ``SYCL`` devices communication support, specify ``SYCL`` compiler (only Intel\ |reg|\ oneAPI DPC++/C++ Compiler is supported): :: - cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=dpcpp -DCOMPUTE_BACKEND=dpcpp + cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp * To specify the **build type**, modify the ``cmake`` command: diff --git a/doc/rst/source/introduction/sample.rst b/doc/rst/source/introduction/sample.rst index 4c6f3fda4..3ca73fadd 100644 --- a/doc/rst/source/introduction/sample.rst +++ b/doc/rst/source/introduction/sample.rst @@ -11,15 +11,15 @@ The sample code below shows how to use |product_short| API to perform allreduce Build details ************* -#. :ref:`Build ` |product_short| with ``SYCL`` support (only DPC++ is supported). +#. :ref:`Build ` |product_short| with ``SYCL`` support (only Intel\ |reg|\ oneAPI DPC++/C++ Compiler is supported). #. :ref:`Set up ` the library environment. -#. Use ``dpcpp`` compiler to build the sample: +#. 
Use the C++ driver with the -fsycl option to build the sample: :: - dpcpp -o sample sample.cpp -lccl -lmpi + icpx -o sample sample.cpp -lccl -lmpi Run the sample diff --git a/doc/rst/source/programming-model/limitations.rst b/doc/rst/source/programming-model/limitations.rst index 53ebf3d94..dde24eea9 100644 --- a/doc/rst/source/programming-model/limitations.rst +++ b/doc/rst/source/programming-model/limitations.rst @@ -5,3 +5,7 @@ Limitations The list of scenarios not yet supported by oneCCL: - Creation of multiple ranks within single process + + +.. note:: Currently, collectives are synchronizing the host thread. + \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 423563e77..92851fd8c 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -15,6 +15,19 @@ # cmake_minimum_required (VERSION 2.8) +set(PROJECT_NAME "oneCCL examples") +project(${PROJECT_NAME}) + +message(STATUS "PROJECT_NAME: ${PROJECT_NAME}") +message(STATUS "PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}") + +# include helpers if we build in standalone mode +if (${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) + set(COMMON_CMAKE_DIR ${PROJECT_SOURCE_DIR}/../cmake) + include(${COMMON_CMAKE_DIR}/helpers.cmake) + define_compute_backend() +endif() + if (DEFINED ENV{CCL_CONFIGURATION}) set(CCL_CONFIGURATION "$ENV{CCL_CONFIGURATION}") if(${CCL_CONFIGURATION} STREQUAL "cpu_gpu_dpcpp") @@ -82,11 +95,10 @@ set(GCC_BF16_MIN_SUPPORTED "4.9.0") if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${GCC_BF16_MIN_SUPPORTED})) add_definitions(-DCCL_BF16_COMPILER) set(CCL_BF16_COMPILER ON) - message(STATUS "BF16 compiler: yes") else() set(CCL_BF16_COMPILER OFF) - message(STATUS "BF16 compiler: no") endif() +message(STATUS "BF16 AVX512F compiler: ${CCL_BF16_COMPILER}") if (CCL_BF16_COMPILER) if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR 
${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) @@ -100,6 +112,7 @@ endif() include_directories(include) add_subdirectory(cpu) + if ("${COMPUTE_BACKEND}" STREQUAL "dpcpp") add_subdirectory(sycl) endif() diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 66dcffbf2..8e264e369 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -44,8 +44,6 @@ foreach(src ${sources}) target_link_libraries(${executable} PUBLIC dl) target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/) target_link_libraries(${executable} PUBLIC mpi) - target_link_libraries(${executable} PUBLIC -L${LIBFABRIC_LIB_DIR}) - target_link_libraries(${executable} PUBLIC fabric) target_link_libraries(${executable} PUBLIC ${COMPUTE_BACKEND_TARGET_NAME}) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/benchmark OPTIONAL) endforeach() diff --git a/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp b/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp index 8cf27f734..724fdbe57 100644 --- a/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp +++ b/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp @@ -38,7 +38,7 @@ struct cpu_alltoall_coll : cpu_base_coll { for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) { value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx]; Dtype rbuf_expected = get_val(static_cast(e_idx / elem_count)); - if (value != sbuf_expected) { + if (!base_coll::get_inplace() && (value != sbuf_expected)) { std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx " << rank_idx << ", elem_idx " << e_idx << ", expected " << sbuf_expected << ", got " << value << std::endl; diff --git a/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp b/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp index dd086e73a..2acdce434 100644 --- a/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp +++ b/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp @@ -69,7 
+69,7 @@ struct sycl_alltoall_coll : sycl_base_coll { for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) { value = host_send_buf[e_idx]; Dtype rbuf_expected = get_val(static_cast(e_idx / elem_count)); - if (!base_coll::get_inplace() && value != sbuf_expected) { + if (!base_coll::get_inplace() && (value != sbuf_expected)) { std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx " << rank_idx << ", elem_idx " << e_idx << ", expected " << sbuf_expected << ", got " << value << std::endl; diff --git a/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp b/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp index 2be214dc7..374babdb9 100644 --- a/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp +++ b/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp @@ -37,7 +37,7 @@ struct cpu_alltoallv_coll : cpu_base_coll { for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) { value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx]; Dtype rbuf_expected = get_val(static_cast(e_idx / elem_count)); - if (value != sbuf_expected) { + if (!base_coll::get_inplace() && (value != sbuf_expected)) { std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx " << rank_idx << ", elem_idx " << e_idx << ", expected " << sbuf_expected << ", got " << value << std::endl; diff --git a/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp b/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp index ec59e8295..c8c4082fa 100644 --- a/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp +++ b/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp @@ -69,7 +69,7 @@ struct sycl_alltoallv_coll : sycl_base_coll { for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) { value = host_send_buf[e_idx]; Dtype rbuf_expected = get_val(static_cast(e_idx / elem_count)); - if (!base_coll::get_inplace() && value != sbuf_expected) { + if (!base_coll::get_inplace() && (value != sbuf_expected)) { std::cout << this->name() << 
" send_bufs: buf_idx " << b_idx << ", rank_idx " << rank_idx << ", elem_idx " << e_idx << ", expected " << sbuf_expected << ", got " << value << std::endl; diff --git a/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp b/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp index 6a03f0588..0980f18f2 100644 --- a/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp +++ b/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp @@ -67,10 +67,8 @@ struct sycl_reduce_scatter_coll : sycl_base_collname() << " send_bufs: buf_idx " << b_idx << ", rank_idx " << rank_idx << ", elem_idx " << e_idx << ", expected " diff --git a/examples/benchmark/src/transport_impl.hpp b/examples/benchmark/src/transport_impl.hpp index 88dd23413..1c25dc089 100644 --- a/examples/benchmark/src/transport_impl.hpp +++ b/examples/benchmark/src/transport_impl.hpp @@ -93,9 +93,9 @@ std::vector& transport_data::get_bench_streams() { void transport_data::init_comms(user_options_t& options) { int ranks_per_proc = options.ranks_per_proc; - std::vector local_ranks; + std::vector proc_ranks; for (int idx = 0; idx < ranks_per_proc; idx++) { - local_ranks.push_back(rank * ranks_per_proc + idx); + proc_ranks.push_back(rank * ranks_per_proc + idx); } ccl::context context = ccl::create_context(); @@ -112,7 +112,7 @@ void transport_data::init_comms(user_options_t& options) { #ifdef CCL_ENABLE_SYCL else if (options.backend == BACKEND_SYCL) { auto sycl_queues = create_sycl_queues( - sycl_dev_names[options.sycl_dev_type], local_ranks, options.sycl_root_dev); + sycl_dev_names[options.sycl_dev_type], proc_ranks, options.sycl_root_dev); ASSERT(!sycl_queues.empty(), "queues should contain at least one queue"); ASSERT(static_cast(ranks_per_proc) == sycl_queues.size(), "ranks and queues sizes should match"); @@ -136,7 +136,7 @@ void transport_data::init_comms(user_options_t& options) { } for (int idx = 0; idx < ranks_per_proc; idx++) { - 
r2d_map.emplace(local_ranks[idx], devices[idx]); + r2d_map.emplace(proc_ranks[idx], devices[idx]); } comms = ccl::create_communicators(size * ranks_per_proc, r2d_map, context, kvs); diff --git a/examples/common/CMakeLists.txt b/examples/common/CMakeLists.txt index 17ae64189..238b26c9f 100644 --- a/examples/common/CMakeLists.txt +++ b/examples/common/CMakeLists.txt @@ -27,9 +27,5 @@ foreach(src ${sources}) target_link_libraries(${executable} PUBLIC rt) target_link_libraries(${executable} PUBLIC m) target_link_libraries(${executable} PUBLIC dl) - target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/) - target_link_libraries(${executable} PUBLIC mpi) - target_link_libraries(${executable} PUBLIC -L${LIBFABRIC_LIB_DIR}) - target_link_libraries(${executable} PUBLIC fabric) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/common OPTIONAL) endforeach() diff --git a/examples/cpu/CMakeLists.txt b/examples/cpu/CMakeLists.txt index 8ecb30556..0b9abcae8 100644 --- a/examples/cpu/CMakeLists.txt +++ b/examples/cpu/CMakeLists.txt @@ -29,8 +29,6 @@ foreach(src ${sources}) target_link_libraries(${executable} PUBLIC stdc++) target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/) target_link_libraries(${executable} PUBLIC mpi) - target_link_libraries(${executable} PUBLIC -L${LIBFABRIC_LIB_DIR}) - target_link_libraries(${executable} PUBLIC fabric) target_link_libraries(${executable} PUBLIC ${COMPUTE_BACKEND_TARGET_NAME}) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/cpu OPTIONAL) endforeach() diff --git a/examples/external_launcher/CMakeLists.txt b/examples/external_launcher/CMakeLists.txt index 90d7f35c9..f898892d0 100644 --- a/examples/external_launcher/CMakeLists.txt +++ b/examples/external_launcher/CMakeLists.txt @@ -26,8 +26,6 @@ foreach(src ${sources}) target_link_libraries(${executable} PRIVATE ccl) target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/) 
target_link_libraries(${executable} PUBLIC mpi) - target_link_libraries(${executable} PUBLIC -L${LIBFABRIC_LIB_DIR}) - target_link_libraries(${executable} PUBLIC fabric) target_link_libraries(${executable} PUBLIC ${COMPUTE_BACKEND_TARGET_NAME}) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/external_launcher OPTIONAL) endforeach() diff --git a/examples/external_launcher/run_binary.sh b/examples/external_launcher/run_binary.sh index 732c15748..eb84b552d 100755 --- a/examples/external_launcher/run_binary.sh +++ b/examples/external_launcher/run_binary.sh @@ -132,6 +132,7 @@ function run() host=`hostname` binary_env="FI_PROVIDER=tcp CCL_LOG_LEVEL=info" + binary_env="${binary_env} CCL_PROCESS_LAUNCHER=none CCL_LOCAL_SIZE=${LOCAL_SIZE} CCL_LOCAL_RANK=${LOCAL_RANK}" binary_path="$dir/external_launcher" binary_arg="$SIZE $RANK ${KVS_MODE} ${KVS_PARAM}" diff --git a/examples/include/sycl_base.hpp b/examples/include/sycl_base.hpp index f31c81553..2726dd037 100644 --- a/examples/include/sycl_base.hpp +++ b/examples/include/sycl_base.hpp @@ -16,7 +16,20 @@ #pragma once #include +#if __has_include() +#include +#elif __has_include() #include +#else +#error "Unsupported compiler" +#endif +#if __has_include() +#include +#elif __has_include() +#include +#else +#error "Unsupported compiler" +#endif #include #include #include @@ -25,16 +38,42 @@ #include #include -#include "CL/sycl/property_list.hpp" #include "base.hpp" #include "base_utils.hpp" - #include "oneapi/ccl.hpp" +#if defined(__INTEL_LLVM_COMPILER) +#if (__INTEL_LLVM_COMPILER < 20230000) +#define CCL_USE_SYCL121_API 1 +#else // (__INTEL_LLVM_COMPILER < 20230000) +#define CCL_USE_SYCL121_API 0 +#endif // (__INTEL_LLVM_COMPILER < 20230000) +#elif defined(__LIBSYCL_MAJOR_VERSION) +#if (__LIBSYCL_MAJOR_VERSION < 6) +#define CCL_USE_SYCL121_API 1 +#else // (__LIBSYCL_MAJOR_VERSION < 6) +#define CCL_USE_SYCL121_API 0 +#endif // (__LIBSYCL_MAJOR_VERSION < 6) +#else // __INTEL_LLVM_COMPILER || 
__LIBSYCL_MAJOR_VERSION +#error "Unsupported compiler" +#endif + using namespace std; using namespace sycl; using namespace sycl::access; +namespace ccl { +#if CCL_USE_SYCL121_API +const auto cpu_selector_v = ::sycl::cpu_selector{}; +const auto gpu_selector_v = ::sycl::gpu_selector{}; +const auto default_selector_v = ::sycl::default_selector{}; +#else // CCL_USE_SYCL121_API +inline const auto& cpu_selector_v = ::sycl::cpu_selector_v; +inline const auto& gpu_selector_v = ::sycl::gpu_selector_v; +inline const auto& default_selector_v = ::sycl::default_selector_v; +#endif // CCL_USE_SYCL121_API +} // namespace ccl + /* help functions for sycl-specific base implementation */ inline bool has_gpu() { vector devices = device::get_devices(); @@ -228,36 +267,30 @@ inline std::vector create_sycl_queues(const std::string& device_typ try { if (device_type.compare("gpu") == 0) { if (!has_gpu()) { - throw std::runtime_error("GPU is requested but not available."); + throw std::runtime_error("GPU is requested but not available"); } /* GPU type has special handling to cover multi-tile case */ devices = create_sycl_gpu_devices(select_root_devices); } else { - unique_ptr selector; - if (device_type.compare("cpu") == 0) { - selector.reset(new cpu_selector()); - } - else if (device_type.compare("host") == 0) { - selector.reset(new host_selector()); + devices.push_back(device(ccl::cpu_selector_v)); } else if (device_type.compare("default") == 0) { if (!has_accelerator()) { - selector.reset(new default_selector()); + devices.push_back(device(ccl::default_selector_v)); } else { - selector.reset(new host_selector()); + devices.push_back(device(ccl::cpu_selector_v)); cout << "Accelerator is the first in device list, but unavailable for multiprocessing " - << " host_selector has been created instead of default_selector.\n"; + << " cpu_selector has been created instead of default_selector.\n"; } } else { - throw std::runtime_error("Please provide device type: cpu | gpu | host | default"); + 
throw std::runtime_error("Please provide device type: cpu | gpu | default"); } - devices.push_back(sycl::device(*selector)); } } catch (...) { @@ -268,10 +301,47 @@ inline std::vector create_sycl_queues(const std::string& device_typ throw std::runtime_error("No devices of requested type available"); } - std::vector rank_devices; + int global_rank = 0, local_rank = 0; + int global_size = 0, local_size = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &global_rank); + MPI_Comm_size(MPI_COMM_WORLD, &global_size); + + MPI_Comm local_comm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm); + MPI_Comm_rank(local_comm, &local_rank); + MPI_Comm_size(local_comm, &local_size); + MPI_Comm_free(&local_comm); + + std::stringstream error_msg; + + if (local_rank > global_rank) { + error_msg << "Local rank should be less or equal to global rank (local_rank: " << local_rank + << ", global_rank: " << global_rank << ")"; + throw std::runtime_error(error_msg.str()); + } + + if (local_size > global_size) { + error_msg << "Local size should be less or equal to global size (local_size: " << local_size + << ", global_size: " << global_size << ")"; + throw std::runtime_error(error_msg.str()); + } + + if (ranks.size() != 1) { + error_msg << "Unexpected number of device ranks: " << ranks.size(); + throw std::runtime_error(error_msg.str()); + } - for (size_t idx = 0; idx < ranks.size(); idx++) { - rank_devices.push_back(devices[ranks[idx] % devices.size()]); + if (ranks[0] != global_rank) { + error_msg << "Unexpected device rank: " << ranks[0] << ", expected: " << global_rank; + throw std::runtime_error(error_msg.str()); + } + + // use local rank for device selection + std::vector local_ranks(1, local_rank); + + std::vector rank_devices; + for (size_t idx = 0; idx < local_ranks.size(); idx++) { + rank_devices.push_back(devices[local_ranks[idx] % devices.size()]); } if (rank_devices.empty()) { @@ -284,7 +354,7 @@ inline std::vector create_sycl_queues(const 
std::string& device_typ ctx = sycl::context(rank_devices); } catch (sycl::exception&) { - size_t preferred_idx = (ranks.back() / ranks.size()) % devices.size(); + size_t preferred_idx = (local_ranks.back() / local_ranks.size()) % devices.size(); cout << "Can not create context from all rank devices of type: " << device_type << ", create context from single device, idx " << preferred_idx << "\n"; ctx = sycl::context(devices[preferred_idx]); @@ -320,32 +390,14 @@ inline std::vector create_sycl_queues(const std::string& device_typ return queues; } -inline bool create_sycl_queue(int argc, char* argv[], int rank, queue& q) { - if (argc >= 2) { - try { - std::vector ranks = { rank }; - q = create_sycl_queues(argv[1], ranks)[0]; - return true; - } - catch (std::exception& e) { - cerr << e.what() << "\n"; - return false; - } - } - else { - cerr << "Please provide device type: cpu | gpu | host | default\n"; - return false; - } -} - inline bool create_sycl_queue(const std::string& type, int rank, queue& q, const property_list& queue_props = {}) { - if (type == "gpu" || type == "cpu" || type == "host") { + if (type == "gpu" || type == "cpu" || type == "host" || type == "default") { try { std::vector ranks = { rank }; - q = create_sycl_queues(type.c_str(), ranks, false, queue_props)[0]; + q = create_sycl_queues(type, ranks, false, queue_props)[0]; return true; } catch (std::exception& e) { @@ -354,10 +406,15 @@ inline bool create_sycl_queue(const std::string& type, } } else { + cerr << "Unknown device type: " << type << ", please provide: cpu | gpu | host | default\n"; return false; } } +inline bool create_sycl_queue(int argc, char* argv[], int rank, queue& q) { + return create_sycl_queue(((argc >= 2) ? 
argv[1] : "unknown"), rank, q, {}); +} + inline bool handle_exception(queue& q) { try { q.wait_and_throw(); diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index dc79210be..7426280bc 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -29,8 +29,6 @@ foreach(src ${sources}) target_link_libraries(${executable} PRIVATE ccl) target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/) target_link_libraries(${executable} PUBLIC mpi) - target_link_libraries(${executable} PUBLIC -L${LIBFABRIC_LIB_DIR}) - target_link_libraries(${executable} PUBLIC fabric) target_link_libraries(${executable} PRIVATE ${COMPUTE_BACKEND_TARGET_NAME}) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/sycl OPTIONAL) endforeach() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7b4cff9bf..394837321 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -40,9 +40,10 @@ if (CCL_ENABLE_SYCL AND CCL_ENABLE_ZE) sched/entry/ze/ze_primitives.cpp sched/entry/ze/ze_reduce_local_entry.cpp - common/global/ze_data.cpp + common/global/ze/ze_data.cpp + common/global/ze/ze_fd_manager.cpp common/utils/sycl_utils.cpp - common/ze/ze_api_wrapper.cpp + common/api_wrapper/ze_api_wrapper.cpp sched/ze/ze_event_manager.cpp sched/ze/ze_handle_manager.cpp @@ -89,7 +90,6 @@ set(CCL_SRC coll/coll_util.cpp coll/algorithms/allgatherv.cpp coll/algorithms/allreduce/allreduce.cpp - coll/algorithms/allreduce/allreduce_2d.cpp coll/algorithms/allreduce/allreduce_rma.cpp coll/algorithms/algorithm_utils.cpp coll/algorithms/alltoall.cpp @@ -127,13 +127,19 @@ set(CCL_SRC common/log/log.cpp common/request/request.cpp common/stream/stream.cpp - common/utils/memcpy.cpp + common/utils/exchange_utils.cpp common/utils/fd_info.cpp + common/utils/memcpy.cpp common/utils/spinlock.cpp common/utils/utils.cpp common/utils/version.cpp common/utils/yield.cpp + common/api_wrapper/api_wrapper.cpp + common/api_wrapper/mpi_api_wrapper.cpp + 
common/api_wrapper/ofi_api_wrapper.cpp + common/api_wrapper/pmix_api_wrapper.cpp + comp/bf16/bf16.cpp comp/bf16/bf16_intrisics.cpp comp/comp.cpp @@ -216,7 +222,6 @@ set(SRC_INCLUDE_DIRS) set(SRC_LINK_DIRS) set(SRC_LINK_LIBS) -# common settings of security options if (USE_SECURITY_FLAGS) set(SRC_C_FLAGS "${SRC_C_FLAGS} -Wformat -Wformat-security -D_FORTIFY_SOURCE=2 -fstack-protector") set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -Wformat -Wformat-security -D_FORTIFY_SOURCE=2 -fstack-protector") @@ -229,12 +234,16 @@ if (USE_SECURITY_FLAGS) endif() endif() +if (ENABLE_LINKER_RUNPATH) + set(SRC_SHARED_LINKER_FLAGS "${SRC_SHARED_LINKER_FLAGS} -Wl,--enable-new-dtags -Wl,-rpath='$ORIGIN'") +endif() + set(SRC_SHARED_LINKER_FLAGS "${SRC_SHARED_LINKER_FLAGS} -Wl,--version-script=${PROJECT_SOURCE_DIR}/ccl.map") if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR ${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel") - #To suppress for 'offsetof applied to non-POD (Plain Old Data) types is nonstandar' + # To suppress for 'offsetof applied to non-POD (Plain Old Data) types is nonstandard' set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -diag-disable=1875") - # To supress "overloaded virtual function is only partially overridden in class", as we intentially + # To suppress "overloaded virtual function is only partially overridden in class", as we intentionally # don't override some base class' overloads in order to keep a default implementation for them set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -diag-disable=654") endif() @@ -256,13 +265,20 @@ list(APPEND SRC_INCLUDE_DIRS ${HWLOC_INCLUDE_DIR} ${ITT_INCLUDE_DIR}) -list(APPEND SRC_LINK_DIRS ${LIBFABRIC_LIB_DIR}) +if (ENABLE_DRM) + list(APPEND SRC_SYSTEM_INCLUDE_DIRS + ${DRM_INCLUDE_DIR}) +endif() + +if (ENABLE_PMIX) + list(APPEND SRC_INCLUDE_DIRS + ${PMIX_INCLUDE_DIR}) +endif() list(APPEND SRC_LINK_LIBS dl pthread ${EXTERNAL_LIBS} - fabric ${HWLOC_LIB_DIR}/libhwloc.a ${ITT_LIB_DIR}/libittnotify.a) @@ -270,8 +286,6 @@ if (ENABLE_MPI) set(SRC_C_FLAGS "${SRC_C_FLAGS}
-DCCL_ENABLE_MPI") set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -DCCL_ENABLE_MPI") list(APPEND SRC_INCLUDE_DIRS ${MPI_INCLUDE_DIR}) - list(APPEND SRC_LINK_DIRS ${MPI_LIB_DIR}) - list(APPEND SRC_LINK_LIBS mpi) endif() link_directories(${SRC_LINK_DIRS}) @@ -284,6 +298,7 @@ set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${SRC_SHARED_LINKER_ add_library(ccl-objects OBJECT ${CCL_SRC}) set_target_properties(ccl-objects PROPERTIES POSITION_INDEPENDENT_CODE 1) target_include_directories(ccl-objects PRIVATE ${SRC_INCLUDE_DIRS}) +target_include_directories(ccl-objects SYSTEM PRIVATE ${SRC_SYSTEM_INCLUDE_DIRS}) if (COMPUTE_BACKEND_TARGET_NAME) target_include_directories(ccl-objects PUBLIC ${INTERFACE_INCLUDE_DIRECTORIES}) diff --git a/src/atl/atl_base_comm.cpp b/src/atl/atl_base_comm.cpp index 1adbbac46..4e2f44f17 100644 --- a/src/atl/atl_base_comm.cpp +++ b/src/atl/atl_base_comm.cpp @@ -61,7 +61,7 @@ ccl_spinlock atl_base_comm::comm_id_storage_guard{}; atl_base_comm::~atl_base_comm() { std::lock_guard lock{ comm_id_storage_guard }; transport->get_comm_id_storage().release(comm_id); - tag.reset(); + tag_creator.reset(); comm_count--; if (comm_count.load() == 0) { transport->finalize(rank); @@ -137,9 +137,10 @@ int atl_base_comm::create_comm_id() { } void atl_base_comm::init_tag() { - tag = std::shared_ptr(new ccl_atl_tag(attr.out.tag_bits, attr.out.max_tag)); + tag_creator = + std::shared_ptr(new ccl_atl_tag(attr.out.tag_bits, attr.out.max_tag)); if (rank == 0) { - LOG_DEBUG("atl tag: ", tag->to_string()); + LOG_DEBUG("atl tag: ", tag_creator->to_string()); } } @@ -148,7 +149,7 @@ void atl_base_comm::update_executor() { if (rank < coord.local_count) LOG_INFO( "start workers for local process [", coord.local_idx, ":", coord.local_count, "]"); - executor->start_workers(coord.local_idx, coord.local_count); + executor->start_workers(coord); } } diff --git a/src/atl/atl_base_comm.hpp b/src/atl/atl_base_comm.hpp index 43142be24..6aa7da4db 100644 --- a/src/atl/atl_base_comm.hpp 
+++ b/src/atl/atl_base_comm.hpp @@ -222,10 +222,6 @@ class atl_base_comm { return rank2rank_map; } - std::vector get_rank2proc_map() const { - return rank2proc_map; - } - int create_comm_id(); int get_comm_id() const { @@ -236,7 +232,7 @@ class atl_base_comm { comm_id = atl_comm_id_storage::invalid_comm_id; } - std::shared_ptr tag; + std::shared_ptr tag_creator; static atl_attr_t attr; protected: @@ -251,8 +247,8 @@ class atl_base_comm { int parent_rank; int parent_size; - std::vector rank2rank_map; - std::vector rank2proc_map; + std::vector rank2rank_map{}; + std::vector rank2proc_map{}; atl_proc_coord_t coord; int comm_id = atl_comm_id_storage::invalid_comm_id; diff --git a/src/atl/atl_base_transport.hpp b/src/atl/atl_base_transport.hpp index b9ccdb299..092e714b9 100644 --- a/src/atl/atl_base_transport.hpp +++ b/src/atl/atl_base_transport.hpp @@ -178,8 +178,8 @@ class atl_base_transport { size_t color, int local_idx) = 0; - virtual atl_status_t get_rank2rank_map(std::shared_ptr pmi, - std::vector& rank2rank_map) = 0; + virtual atl_status_t get_rank2proc_map(std::shared_ptr pmi, + std::vector& rank2proc_map) = 0; virtual std::string to_string() = 0; diff --git a/src/atl/mpi/atl_mpi.cpp b/src/atl/mpi/atl_mpi.cpp index f08653502..89d5ca261 100644 --- a/src/atl/mpi/atl_mpi.cpp +++ b/src/atl/mpi/atl_mpi.cpp @@ -52,7 +52,6 @@ atl_status_t atl_mpi::init(int* argc, } MPI_Initialized(&ctx.is_external_init); - if (!ctx.is_external_init) { ret = MPI_Init_thread(argc, argv, required_thread_level, &provided_thread_level); if (provided_thread_level < required_thread_level) { @@ -581,6 +580,10 @@ atl_status_t atl_mpi::comm_split(const std::vector& base_eps, snprintf(mpi_ep_idx_str, MPI_MAX_INFO_VAL, "%zd", mpi_ep_idx); MPI_Info_set(info, ctx.EP_IDX_KEY, mpi_ep_idx_str); + /* pre-requisite for pref-nic hint */ + MPI_Info_set(info, "mpi_assert_no_any_source", "true"); + MPI_Info_set(info, "mpi_assert_no_any_tag", "true"); + if (ctx.mnic_type != ATL_MNIC_NONE) { /* set NIC index 
*/ nic_idx = idx; @@ -656,7 +659,6 @@ atl_status_t atl_mpi::finalize(int global_idx) { int is_mpi_finalized = 0; MPI_Finalized(&is_mpi_finalized); - if (!is_mpi_finalized) { ctx.bf16_finalize(); ctx.fp16_finalize(); @@ -780,14 +782,14 @@ MPI_Datatype atl_mpi::atl2mpi_dtype(atl_datatype_t dtype) { } MPI_Op atl_mpi::atl2mpi_op(atl_reduction_t rtype, MPI_Datatype dtype) { -#ifdef ATL_MPI_BF16 - if (dtype == ctx.bf16.dtype) + if (dtype == ctx.bf16.dtype) { return ctx.atl2mpi_op_bf16(rtype); -#endif // ATL_MPI_BF16 + } #ifdef ATL_MPI_FP16 - if (dtype == ctx.fp16.dtype) + if (dtype == ctx.fp16.dtype) { return ctx.atl2mpi_op_fp16(rtype); + } #endif // ATL_MPI_FP16 (void)dtype; diff --git a/src/atl/mpi/atl_mpi.hpp b/src/atl/mpi/atl_mpi.hpp index fd7c32002..23a93a22a 100644 --- a/src/atl/mpi/atl_mpi.hpp +++ b/src/atl/mpi/atl_mpi.hpp @@ -17,10 +17,9 @@ #ifdef CCL_ENABLE_MPI -#include - #include "atl/atl_base_transport.hpp" #include "atl/mpi/atl_mpi_ctx.hpp" +#include "common/api_wrapper/mpi_api_wrapper.hpp" #define ATL_MPI_RET(ret) (ret != MPI_SUCCESS) ? 
ATL_STATUS_FAILURE : ATL_STATUS_SUCCESS @@ -214,8 +213,8 @@ class atl_mpi : public atl_base_transport { size_t color, int local_idx) override; - atl_status_t get_rank2rank_map(std::shared_ptr pmi, - std::vector& rank2rank_map) override { + atl_status_t get_rank2proc_map(std::shared_ptr pmi, + std::vector& rank2proc_map) override { return ATL_STATUS_UNSUPPORTED; } diff --git a/src/atl/mpi/atl_mpi_comm.cpp b/src/atl/mpi/atl_mpi_comm.cpp index 12f3f057f..ab94285cb 100644 --- a/src/atl/mpi/atl_mpi_comm.cpp +++ b/src/atl/mpi/atl_mpi_comm.cpp @@ -63,10 +63,6 @@ atl_mpi_comm::atl_mpi_comm(atl_mpi_comm* parent, int color) { rank2rank_map.resize(size); MPI_Allgather(&parent_rank, 1, MPI_INT, rank2rank_map.data(), 1, MPI_INT, mpi_ep->mpi_comm); - - rank2proc_map.resize(size); - int parent_proc_idx = parent->rank2proc_map[parent_rank]; - MPI_Allgather(&parent_proc_idx, 1, MPI_INT, rank2proc_map.data(), 1, MPI_INT, mpi_ep->mpi_comm); } void atl_mpi_comm::update_eps() { @@ -122,9 +118,9 @@ atl_status_t atl_mpi_comm::init_transport(bool is_new, parent_rank = rank = coord.global_idx; parent_size = size = coord.global_count; - rank2proc_map.resize(size); + rank2rank_map.resize(size); for (int i = 0; i < size; i++) { - rank2proc_map[i] = i; + rank2rank_map[i] = i; } } diff --git a/src/atl/mpi/atl_mpi_comm.hpp b/src/atl/mpi/atl_mpi_comm.hpp index 904df46e8..64c62bf78 100644 --- a/src/atl/mpi/atl_mpi_comm.hpp +++ b/src/atl/mpi/atl_mpi_comm.hpp @@ -17,10 +17,9 @@ #ifdef CCL_ENABLE_MPI -#include - #include "atl/atl_base_comm.hpp" #include "atl/mpi/atl_mpi.hpp" +#include "common/api_wrapper/mpi_api_wrapper.hpp" class atl_mpi_comm : public atl_base_comm { public: diff --git a/src/atl/mpi/atl_mpi_ctx.cpp b/src/atl/mpi/atl_mpi_ctx.cpp index 2fde9e5e2..06dcc6993 100644 --- a/src/atl/mpi/atl_mpi_ctx.cpp +++ b/src/atl/mpi/atl_mpi_ctx.cpp @@ -99,17 +99,11 @@ void FP16_TARGET_ATTRIBUTE_ALL fp16_max_op(void* in, } #endif // ATL_MPI_FP16 -#ifdef ATL_MPI_BF16 - void 
BF16_INLINE_TARGET_ATTRIBUTE_ALL bf16_base_op(void* in, void* inout, int* length, ccl::reduction op) { - unsigned short* in_buf = (unsigned short*)in; - unsigned short* inout_buf = (unsigned short*)inout; - - size_t len = *length; - ccl_bf16_reduce_impl(in_buf, inout_buf, len, op); + ccl_bf16_reduce(in, *length, inout, nullptr, op); } void BF16_TARGET_ATTRIBUTE_ALL bf16_sum_op(void* in, @@ -143,7 +137,6 @@ void BF16_TARGET_ATTRIBUTE_ALL bf16_max_op(void* in, check_op_params(in, inout, length, datatype, __FUNCTION__); bf16_base_op(in, inout, length, ccl::reduction::max); } -#endif // ATL_MPI_BF16 size_t atl_mpi_ctx::get_nic_count(const char* nic_count_key) { size_t count = 1; @@ -316,12 +309,6 @@ size_t atl_mpi_ctx::get_ep_count(const atl_attr_t& attr) { } int atl_mpi_ctx::bf16_init() { - if (ccl::global_data::env().bf16_impl_type <= ccl_bf16_no_hardware_support) { - return ATL_STATUS_SUCCESS; - } - -#ifdef ATL_MPI_BF16 - int ret = MPI_SUCCESS; // create custom MPI BF16 dtype @@ -371,8 +358,6 @@ int atl_mpi_ctx::bf16_init() { return ATL_STATUS_FAILURE; } -#endif // ATL_MPI_BF16 - return ATL_STATUS_SUCCESS; } @@ -481,7 +466,6 @@ void atl_mpi_ctx::fp16_finalize() { } } -#ifdef ATL_MPI_BF16 MPI_Op atl_mpi_ctx::atl2mpi_op_bf16(atl_reduction_t rtype) { switch (rtype) { case ATL_REDUCTION_SUM: return bf16.sum_op; @@ -491,7 +475,6 @@ MPI_Op atl_mpi_ctx::atl2mpi_op_bf16(atl_reduction_t rtype) { default: printf("unknown reduction type: %d\n", rtype); exit(1); } } -#endif // ATL_MPI_BF16 #ifdef ATL_MPI_FP16 MPI_Op atl_mpi_ctx::atl2mpi_op_fp16(atl_reduction_t rtype) { diff --git a/src/atl/mpi/atl_mpi_ctx.hpp b/src/atl/mpi/atl_mpi_ctx.hpp index 1e669ee60..bb93a7608 100644 --- a/src/atl/mpi/atl_mpi_ctx.hpp +++ b/src/atl/mpi/atl_mpi_ctx.hpp @@ -17,15 +17,13 @@ #ifdef CCL_ENABLE_MPI -#include - #include "atl/atl_def.h" +#include "common/api_wrapper/mpi_api_wrapper.hpp" +#include "comp/bf16/bf16.hpp" #include "comp/bf16/bf16_intrisics.hpp" #include "comp/fp16/fp16_intrisics.hpp" 
-#ifdef CCL_BF16_COMPILER #define ATL_MPI_BF16 -#endif // CCL_BF16_COMPILER #ifdef CCL_FP16_COMPILER #define ATL_MPI_FP16 diff --git a/src/atl/ofi/atl_ofi.cpp b/src/atl/ofi/atl_ofi.cpp index 9d9a35373..e45c8e845 100644 --- a/src/atl/ofi/atl_ofi.cpp +++ b/src/atl/ofi/atl_ofi.cpp @@ -128,13 +128,14 @@ atl_status_t atl_ofi::init(int* argc, attr->out.max_tag = 0xFFFFFFFFFFFFFFFF; #ifdef CCL_ENABLE_OFI_HMEM - if (prov_env && strstr(prov_env, "verbs") && attr->in.enable_hmem) { + if (prov_env && (strstr(prov_env, "verbs") || strstr(prov_env, "cxi")) && + attr->in.enable_hmem) { struct fi_info* hmem_hints = fi_dupinfo(base_hints); atl_attr_t hmem_attr = *attr; hmem_hints->caps |= FI_HMEM; - hmem_hints->domain_attr->mr_mode = - (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR | FI_MR_LOCAL | FI_MR_HMEM); + hmem_hints->domain_attr->mr_mode = (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR | + FI_MR_LOCAL | FI_MR_HMEM | FI_MR_ENDPOINT); /* TODO: enable shm with HMEM */ hmem_attr.in.enable_shm = 0; @@ -198,8 +199,6 @@ atl_status_t atl_ofi::init(int* argc, eps.push_back(ep); } - ATL_CHECK_STATUS(pmi->pmrt_barrier(), "barrier failed"); - max_retry_count_env = getenv(ATL_OFI_MAX_RETRY_COUNT_ENV); if (max_retry_count_env) { ctx.max_retry_count = safe_c_strtol(max_retry_count_env, nullptr, 10); @@ -279,7 +278,7 @@ atl_status_t atl_ofi::update(std::shared_ptr pmi) { coord.validate(); for (prov_idx = 0; prov_idx < ctx.prov_count; prov_idx++) { - ret = atl_ofi_prov_eps_connect(ctx, coord, prov_idx, pmi, ep_names); + ret = atl_ofi_prov_eps_connect(ctx, coord, prov_idx, pmi, ep_names[prov_idx]); if (ret) return ATL_OFI_RET(ret); } @@ -351,7 +350,7 @@ atl_status_t atl_ofi::send(atl_ep_t& ep, ofi_req = ((atl_ofi_req_t*)req.internal); - cache.get(ep.idx, prov->domain, const_cast(buf), len, &ofi_req->mr); + cache.get(ep, prov, const_cast(buf), len, &ofi_req->mr); void* desc = (ofi_req->mr) ? 
fi_mr_desc(ofi_req->mr) : nullptr; struct iovec iov; @@ -364,7 +363,7 @@ atl_status_t atl_ofi::send(atl_ep_t& ep, msg.iov_count = 1; msg.tag = tag; msg.ignore = 0; - msg.addr = atl_ofi_get_addr(ctx, prov, dst_proc_idx, ep.idx); + msg.addr = atl_ofi_get_addr(prov, dst_proc_idx, ep.idx); msg.context = &ofi_req->fi_ctx; msg.data = 0; @@ -392,7 +391,7 @@ atl_status_t atl_ofi::recv(atl_ep_t& ep, ofi_req = ((atl_ofi_req_t*)req.internal); - cache.get(ep.idx, prov->domain, const_cast(buf), len, &ofi_req->mr); + cache.get(ep, prov, const_cast(buf), len, &ofi_req->mr); void* desc = (ofi_req->mr) ? fi_mr_desc(ofi_req->mr) : nullptr; struct iovec iov; @@ -405,7 +404,7 @@ atl_status_t atl_ofi::recv(atl_ep_t& ep, msg.iov_count = 1; msg.tag = tag; msg.ignore = 0; - msg.addr = atl_ofi_get_addr(ctx, prov, src_proc_idx, ep.idx); + msg.addr = atl_ofi_get_addr(prov, src_proc_idx, ep.idx); msg.context = &ofi_req->fi_ctx; msg.data = 0; @@ -459,7 +458,7 @@ atl_status_t atl_ofi::probe(atl_ep_t& ep, msg->msg_iov = nullptr; msg->desc = nullptr; msg->iov_count = 0; - msg->addr = atl_ofi_get_addr(ctx, prov, src_proc_idx, ep.idx); + msg->addr = atl_ofi_get_addr(prov, src_proc_idx, ep.idx); msg->tag = tag; msg->ignore = 0; msg->context = &(req->fi_ctx); @@ -545,7 +544,7 @@ atl_status_t atl_ofi::read(atl_ep_t& ep, buf, len, (void*)mr->local_key, - atl_ofi_get_addr(ctx, prov, dst_proc_idx, ep.idx), + atl_ofi_get_addr(prov, dst_proc_idx, ep.idx), addr, remote_key, &ofi_req->fi_ctx), @@ -579,7 +578,7 @@ atl_status_t atl_ofi::write(atl_ep_t& ep, buf, len, (void*)mr->local_key, - atl_ofi_get_addr(ctx, prov, dst_proc_idx, ep.idx), + atl_ofi_get_addr(prov, dst_proc_idx, ep.idx), addr, remote_key, &ofi_req->fi_ctx), @@ -661,55 +660,146 @@ atl_status_t atl_ofi::check(atl_ep_t& ep, atl_req_t& req) { return status; } -atl_status_t atl_ofi::get_rank2rank_map(std::shared_ptr pmi, - std::vector& rank2rank_map) { +atl_status_t atl_ofi::get_rank2proc_map(std::shared_ptr pmi, + std::vector& rank2proc_map) { + 
std::lock_guard lock{ addr_table_guard }; + size_t pmi_rank = pmi->get_rank(); + size_t pmi_size = pmi->get_size(); + CCL_THROW_IF_NOT(rank2proc_map.empty()); + rank2proc_map.clear(); + rank2proc_map.resize(pmi_size); int ret; - //TODO: add support for multi-provs - atl_ofi_prov_t* prov = - atl_ofi_get_prov(ctx, coord, eps[0], 0 /* peer_proc_idx */, 0 /* msg_size */); - - std::vector addr_name(prov->addr_len, '\0'); - - size_t local_rank = pmi->get_rank(); - size_t local_size = pmi->get_size(); - // TODO: uncomment after AV insert update - // for (size_t ep_idx = 0; ep_idx < eps.size(); ep_idx++) { - ret = pmi->pmrt_kvs_put( - (char*)ATL_OFI_FI_ADDR_UPDATE_PM_KEY, - local_rank * ATL_OFI_PMI_PROC_MULTIPLIER, // + - // prov_idx * ATL_OFI_PMI_PROV_MULTIPLIER + ep_idx, - prov->eps[0].name.addr, - // addr_name.data(), - prov->addr_len); - if (ret) { - LOG_ERROR("pmrt_kvs_put: ret: ", ret); - return ATL_STATUS_FAILURE; + if (!need_extra_exchange) { + for (size_t i = 0; i < rank2proc_map.size(); i++) { + rank2proc_map[i] = i; + } + need_extra_exchange = true; + return ATL_STATUS_SUCCESS; } - // } - pmi->pmrt_barrier(); - for (size_t i = 0; i < local_size; ++i) { - ret = pmi->pmrt_kvs_get( - (char*)ATL_OFI_FI_ADDR_UPDATE_PM_KEY, - i * ATL_OFI_PMI_PROC_MULTIPLIER, // + - // prov_idx * ATL_OFI_PMI_PROV_MULTIPLIER + ep_idx, - (void*)addr_name.data(), - prov->addr_len); - if (ret) { - LOG_ERROR("pmrt_kvs_get: ret: ", ret); - return ATL_STATUS_FAILURE; + + char my_hostname[ATL_MAX_HOSTNAME_LEN] = { 0 }; + size_t my_hostname_len = 0; + + gethostname(my_hostname, ATL_MAX_HOSTNAME_LEN - 1); + my_hostname_len = strlen(my_hostname); + + CCL_THROW_IF_NOT(my_hostname_len < ATL_MAX_HOSTNAME_LEN, + "unexpected my_hostname_len ", + my_hostname_len, + ", expected max ", + (size_t)(ATL_MAX_HOSTNAME_LEN)); + + if (ATL_MAX_HOSTNAME_LEN - my_hostname_len <= 10) { + LOG_WARN("hostname is quite long, len: ", my_hostname_len, ", name: ", my_hostname); + } + + snprintf( + my_hostname + 
my_hostname_len, ATL_MAX_HOSTNAME_LEN - my_hostname_len, "-%zu", pmi_rank); + + ret = pmi->pmrt_kvs_put((char*)ATL_OFI_HOSTNAME_PM_KEY, + pmi_rank * ATL_OFI_PMI_PROC_MULTIPLIER, + my_hostname, + ATL_MAX_HOSTNAME_LEN); + + ATL_CHECK_STATUS(pmi->pmrt_barrier(), "barrier failed"); + + for (size_t prov_idx = 0; prov_idx < ep_names.size(); prov_idx++) { + size_t named_ep_count = (ctx.provs[prov_idx].sep ? 1 : ctx.ep_count); + for (size_t ep_idx = 0; ep_idx < named_ep_count; ep_idx++) { + ret = pmi->pmrt_kvs_put((char*)ATL_OFI_FI_ADDR_PM_KEY, + pmi_rank * ATL_OFI_PMI_PROC_MULTIPLIER + + prov_idx * ATL_OFI_PMI_PROV_MULTIPLIER + ep_idx, + ctx.provs[prov_idx].eps[ep_idx].name.addr, + ctx.provs[prov_idx].addr_len); + if (ret) { + LOG_ERROR("pmrt_kvs_put: ret: ", ret); + return ATL_STATUS_FAILURE; + } } + } - auto it = std::find(ep_names.begin(), ep_names.end(), addr_name); - if (it == ep_names.end()) { - LOG_ERROR("not found addr_name: ", i); - return ATL_STATUS_FAILURE; + for (size_t prov_idx = 0; prov_idx < ep_names.size(); prov_idx++) { + auto& prov = ctx.provs[prov_idx]; + auto& prov_ep_names = ep_names[prov_idx]; + size_t named_ep_count = (prov.sep ? 
1 : ctx.ep_count); + std::vector addr_name(prov.addr_len, '\0'); + int new_ep_names_count = 0; + auto old_prov_ep_names_size = prov_ep_names.size(); + ATL_CHECK_STATUS(pmi->pmrt_barrier(), "barrier failed"); + for (size_t ep_idx = 0; ep_idx < named_ep_count; ep_idx++) { + for (size_t i = 0; i < pmi_size; i++) { + ret = pmi->pmrt_kvs_get((char*)ATL_OFI_FI_ADDR_PM_KEY, + i * ATL_OFI_PMI_PROC_MULTIPLIER + + prov_idx * ATL_OFI_PMI_PROV_MULTIPLIER + ep_idx, + (void*)addr_name.data(), + prov.addr_len); + if (ret) { + LOG_ERROR("pmrt_kvs_get: ret: ", ret); + return ATL_STATUS_FAILURE; + } + auto it = std::find(prov_ep_names.begin(), prov_ep_names.end(), addr_name); + if (it == prov_ep_names.end()) { + prov_ep_names.push_back(addr_name); + rank2proc_map[i] = (prov_ep_names.size() - 1) / named_ep_count; + new_ep_names_count++; + } + else { + rank2proc_map[i] = std::distance(prov_ep_names.begin(), it) / named_ep_count; + } + } } - size_t named_ep_count = (prov->sep ? 1 : ctx.ep_count); - rank2rank_map[i] = std::distance(ep_names.begin(), it) / named_ep_count; + if (new_ep_names_count > 0) { + CCL_THROW_IF_NOT(old_prov_ep_names_size < prov_ep_names.size()); + if (prov.sep) { + prov.addr_table = (fi_addr_t*)realloc( + prov.addr_table, prov_ep_names.size() * sizeof(fi_addr_t) * ctx.ep_count); + } + else { + prov.addr_table = + (fi_addr_t*)realloc(prov.addr_table, prov_ep_names.size() * sizeof(fi_addr_t)); + } + + if (!prov.addr_table) { + LOG_ERROR("failed addr_table allocation"); + return ATL_STATUS_FAILURE; + } + + int insert_count = 0; + for (size_t i = old_prov_ep_names_size; i < prov_ep_names.size(); i++) { + insert_count += fi_av_insert( + prov.av, prov_ep_names[i].data(), 1, &prov.addr_table[i], 0, nullptr); + } + if (insert_count != new_ep_names_count) { + LOG_ERROR("unexpected av_insert results: expected ", + prov_ep_names.size(), + " got ", + insert_count); + return ATL_STATUS_FAILURE; + } + if (prov.sep) { + fi_addr_t* table; + table = (fi_addr_t*)calloc(1, 
new_ep_names_count * sizeof(fi_addr_t)); + if (table == nullptr) { + LOG_ERROR("memory allocaion failed"); + return ATL_STATUS_FAILURE; + } + memcpy(table, + &prov.addr_table[old_prov_ep_names_size], + new_ep_names_count * sizeof(fi_addr_t)); + + for (int i = 0; i < new_ep_names_count; i++) { + for (size_t j = 0; j < ctx.ep_count; j++) { + prov.addr_table[(old_prov_ep_names_size + i) * ctx.ep_count + j] = + fi_rx_addr(table[i], j, prov.rx_ctx_bits); + } + } + free(table); + } + } } - LOG_DEBUG("transport: rank2rank_map:", ccl::utils::vec_to_string(rank2rank_map)); + LOG_DEBUG("transport: rank2proc_map:", ccl::utils::vec_to_string(rank2proc_map)); return ATL_STATUS_SUCCESS; } @@ -912,7 +1002,11 @@ atl_status_t atl_ofi::open_providers(char* prov_env, prov->idx = prov_idx; prov->is_shm = 1; ATL_CALL(atl_ofi_get_prov_list(ctx, prov_name, base_hints, &prov_list), goto err); - ATL_CALL(atl_ofi_prov_init(ctx, coord, prov_list, prov, attr, pmi, ep_names), goto err); + if (ep_names.size() < prov->idx + 1) { + ep_names.resize(prov->idx + 1); + } + ATL_CALL(atl_ofi_prov_init(ctx, coord, prov_list, prov, attr, pmi, ep_names[prov->idx]), + goto err); free(prov_name); fi_freeinfo(prov_list); ctx.prov_count++; @@ -953,8 +1047,8 @@ void atl_ofi::fi_cache::clear() { } } -void atl_ofi::fi_cache::init(size_t instance_count, int enable_hmem) { - this->enable_hmem = enable_hmem; +void atl_ofi::fi_cache::init(size_t instance_count, int ctx_enable_hmem) { + this->enable_hmem = ctx_enable_hmem; memory_regions.resize(instance_count); } @@ -962,12 +1056,16 @@ atl_ofi::fi_cache::~fi_cache() { clear(); } -void atl_ofi::fi_cache::get(size_t idx, fid_domain* domain, void* buf, size_t bytes, fid_mr** mr) { +void atl_ofi::fi_cache::get(atl_ep_t& ep, + atl_ofi_prov_t* prov, + void* buf, + size_t bytes, + fid_mr** mr) { CCL_THROW_IF_NOT(mr); *mr = nullptr; #ifdef CCL_ENABLE_OFI_HMEM if (enable_hmem) { - memory_regions.at(idx % memory_regions.size()).get(domain, buf, bytes, mr); + 
memory_regions.at(ep.idx % memory_regions.size()).get(ep, prov, buf, bytes, mr); } #endif // CCL_ENABLE_OFI_HMEM } @@ -994,12 +1092,16 @@ void atl_ofi::mr_cache::clear() { cache.clear(); } -void atl_ofi::mr_cache::get(fid_domain* domain, void* buf, size_t bytes, fid_mr** mr) { - CCL_THROW_IF_NOT(domain); +void atl_ofi::mr_cache::get(atl_ep_t& ep, + atl_ofi_prov_t* prov, + void* buf, + size_t bytes, + fid_mr** mr) { + CCL_THROW_IF_NOT(prov->domain); CCL_THROW_IF_NOT(mr); if (ccl::global_data::env().enable_atl_cache) { - key_t key(domain, buf, bytes); + key_t key(prov->domain, buf, bytes); auto key_value = cache.find(key); if (key_value != cache.end()) { *mr = key_value->second; @@ -1056,7 +1158,7 @@ void atl_ofi::mr_cache::get(fid_domain* domain, void* buf, size_t bytes, fid_mr* #endif // CCL_ENABLE_OFI_HMEM int ofi_ret; - ATL_OFI_CALL(fi_mr_regattr(domain, &mr_attr, 0, mr), + ATL_OFI_CALL(fi_mr_regattr(prov->domain, &mr_attr, 0, mr), ofi_ret, CCL_THROW("failed to register mr, ret: ", ofi_ret, @@ -1067,8 +1169,13 @@ void atl_ofi::mr_cache::get(fid_domain* domain, void* buf, size_t bytes, fid_mr* ", iface: ", mr_attr.iface)); + if (prov->info->domain_attr->mr_mode & FI_MR_ENDPOINT) { + fi_mr_bind(*mr, (fid_t)&ep, 0); + fi_mr_enable(*mr); + } + if (ccl::global_data::env().enable_atl_cache) { - key_t key(domain, buf, bytes); + key_t key(prov->domain, buf, bytes); LOG_DEBUG("inserted to mr cache: buf: ", buf, ", bytes: ", bytes); cache.insert({ std::move(key), *mr }); } @@ -1082,3 +1189,8 @@ void atl_ofi::mr_cache::push(fid_mr* mr) { } fi_close(&mr->fid); } + +fi_addr_t atl_ofi::atl_ofi_get_addr(atl_ofi_prov_t* prov, int proc_idx, size_t ep_idx) { + std::lock_guard lock{ addr_table_guard }; + return *(prov->addr_table + ((ctx.ep_count * (proc_idx - prov->first_proc_idx)) + ep_idx)); +} diff --git a/src/atl/ofi/atl_ofi.hpp b/src/atl/ofi/atl_ofi.hpp index 3cd9a6612..07d295a54 100644 --- a/src/atl/ofi/atl_ofi.hpp +++ b/src/atl/ofi/atl_ofi.hpp @@ -17,12 +17,13 @@ #include 
#include -#include #include #include "atl/atl_base_transport.hpp" #include "atl/ofi/atl_ofi_helper.hpp" +#include "common/api_wrapper/ofi_api_wrapper.hpp" #include "common/utils/hash.hpp" +#include "common/utils/spinlock.hpp" class atl_ofi : public atl_base_transport { public: @@ -173,8 +174,8 @@ class atl_ofi : public atl_base_transport { return ATL_STATUS_UNSUPPORTED; } - atl_status_t get_rank2rank_map(std::shared_ptr pmi, - std::vector& rank2rank_map) override; + atl_status_t get_rank2proc_map(std::shared_ptr pmi, + std::vector& rank2proc_map) override; std::string to_string() override; @@ -194,6 +195,7 @@ class atl_ofi : public atl_base_transport { int fi_version, std::shared_ptr pmi, bool log_on_error); + fi_addr_t atl_ofi_get_addr(atl_ofi_prov_t* prov, int proc_idx, size_t ep_idx); atl_ofi_ctx_t ctx; @@ -203,7 +205,7 @@ class atl_ofi : public atl_base_transport { ~mr_cache(); void clear(); - void get(fid_domain* domain, void* buf, size_t bytes, fid_mr** mr); + void get(atl_ep_t& ep, atl_ofi_prov_t* prov, void* buf, size_t bytes, fid_mr** mr); void push(fid_mr* mr); private: @@ -223,8 +225,8 @@ class atl_ofi : public atl_base_transport { void clear(); - void init(size_t instance_count, int enable_hmem); - void get(size_t idx, fid_domain* domain, void* buf, size_t bytes, fid_mr** mr); + void init(size_t instance_count, int ctx_enable_hmem); + void get(atl_ep_t& ep, atl_ofi_prov_t* prov, void* buf, size_t bytes, fid_mr** mr); void push(size_t idx, fid_mr* mr); private: @@ -233,7 +235,11 @@ class atl_ofi : public atl_base_transport { }; fi_cache cache{}; + // accumulates ep names from all comms + // each new portion added into that vector corresponds to single process + // prov_idx : ep_idx : ep_name + std::vector ep_names{}; - /* accumulates ep names from all comms */ - std::list> ep_names{}; + bool need_extra_exchange{ false }; + ccl_spinlock addr_table_guard; }; diff --git a/src/atl/ofi/atl_ofi_comm.cpp b/src/atl/ofi/atl_ofi_comm.cpp index 5dc4e3e27..f5dbfac2e 
100644 --- a/src/atl/ofi/atl_ofi_comm.cpp +++ b/src/atl/ofi/atl_ofi_comm.cpp @@ -110,7 +110,7 @@ atl_status_t atl_ofi_comm::allgatherv(size_t ep_idx, if (peer == rank) continue; - uint64_t op_tag = tag->create(rank, tag_comm_id, tag_counter); + uint64_t op_tag = tag_creator->create(rank, tag_comm_id, tag_counter); // LOG_DEBUG("ofi_allgatherv: send: rank: ", rank, // ", peer: ", peer, // ", comm_id: ", comm_id, @@ -128,7 +128,7 @@ atl_status_t atl_ofi_comm::allgatherv(size_t ep_idx, } } while (ret == ATL_STATUS_AGAIN); - op_tag = tag->create(peer, tag_comm_id, tag_counter); + op_tag = tag_creator->create(peer, tag_comm_id, tag_counter); // LOG_DEBUG("ofi_allgatherv: recv: rank: ", rank, // ", peer: ", peer, // ", comm_id: ", comm_id, @@ -202,8 +202,7 @@ atl_ofi_comm::atl_ofi_comm(atl_ofi_comm* parent, int color) { coord.local_count = 0; std::vector ranks_info(parent_size); - int parent_proc_idx = parent->rank2proc_map[parent_rank]; - rank_info_t rank_info{ color, parent_rank, parent_proc_idx, coord.hostname_hash }; + rank_info_t rank_info{ color, parent_rank, coord.hostname_hash }; std::vector recv_lens(parent_size, sizeof(rank_info)); std::vector offsets(parent_size); offsets[0] = 0; @@ -221,27 +220,26 @@ atl_ofi_comm::atl_ofi_comm(atl_ofi_comm* parent, int color) { req); wait(0, req); - CCL_THROW_IF_NOT(rank2rank_map.empty()); CCL_THROW_IF_NOT(rank2proc_map.empty()); + CCL_THROW_IF_NOT(rank2rank_map.empty()); size = 0; for (auto& it : ranks_info) { int recv_color; int recv_rank; - int recv_proc_idx; size_t recv_hash; - std::tie(recv_color, recv_rank, recv_proc_idx, recv_hash) = it; + std::tie(recv_color, recv_rank, recv_hash) = it; if (recv_color == color) { + rank2proc_map.push_back(parent->rank2proc_map[recv_rank]); rank2rank_map.push_back(recv_rank); - rank2proc_map.push_back(recv_proc_idx); if (recv_hash == coord.hostname_hash) { coord.local_count++; } if (recv_rank == parent_rank) { - coord.global_idx = rank = rank2rank_map.size() - 1; + coord.global_idx = 
rank = rank2proc_map.size() - 1; coord.local_idx = (coord.local_count - 1); } size++; @@ -253,10 +251,6 @@ atl_ofi_comm::atl_ofi_comm(atl_ofi_comm* parent, int color) { color, ", ", to_string(coord), - ", rank2rank_map: ", - ccl::utils::vec_to_string(rank2rank_map), - ", parent rank2rank_map: ", - ccl::utils::vec_to_string(parent->rank2rank_map), ", rank2proc_map: ", ccl::utils::vec_to_string(rank2proc_map), ", parent rank2proc_map: ", @@ -297,8 +291,11 @@ atl_status_t atl_ofi_comm::init_transport(bool is_new) { coord = transport->get_proc_coord(); coord.validate(rank, size); - rank2proc_map.resize(size); - transport->get_rank2rank_map(pmi, rank2proc_map); + transport->get_rank2proc_map(pmi, rank2proc_map); + rank2rank_map.resize(size); + for (int i = 0; i < size; i++) { + rank2rank_map[i] = i; + } } init_tag(); diff --git a/src/atl/ofi/atl_ofi_comm.hpp b/src/atl/ofi/atl_ofi_comm.hpp index bb7de9674..160468df9 100644 --- a/src/atl/ofi/atl_ofi_comm.hpp +++ b/src/atl/ofi/atl_ofi_comm.hpp @@ -168,8 +168,8 @@ class atl_ofi_comm : public atl_base_comm { private: friend atl_comm_manager; - // color, parent_rank, parent_proc_idx, hostname_hash - using rank_info_t = std::tuple; + // color, parent_rank, hostname_hash + using rank_info_t = std::tuple; atl_ofi_comm(atl_ofi_comm* parent, int color); atl_status_t init_transport(bool is_new); diff --git a/src/atl/ofi/atl_ofi_helper.cpp b/src/atl/ofi/atl_ofi_helper.cpp index 74284572c..7ece25d35 100644 --- a/src/atl/ofi/atl_ofi_helper.cpp +++ b/src/atl/ofi/atl_ofi_helper.cpp @@ -126,10 +126,6 @@ atl_ofi_prov_t* atl_ofi_get_prov(atl_ofi_ctx_t& ctx, return &(ctx.provs[prov_idx]); } -fi_addr_t atl_ofi_get_addr(atl_ofi_ctx_t& ctx, atl_ofi_prov_t* prov, int proc_idx, size_t ep_idx) { - return *(prov->addr_table + ((ctx.ep_count * (proc_idx - prov->first_proc_idx)) + ep_idx)); -} - atl_status_t atl_ofi_get_local_proc_coord(atl_proc_coord_t& coord, std::shared_ptr pmi) { atl_status_t ret = ATL_STATUS_SUCCESS; int i; @@ -217,7 +213,7 @@ 
atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t& ctx, const atl_proc_coord_t& coord, size_t prov_idx, std::shared_ptr pmi, - std::list>& ep_names) { + ep_names_t& ep_names) { atl_ofi_prov_t* prov = &(ctx.provs[prov_idx]); atl_status_t ret = ATL_STATUS_SUCCESS; @@ -441,7 +437,7 @@ atl_status_t atl_ofi_prov_eps_connect(atl_ofi_ctx_t& ctx, const atl_proc_coord_t& coord, size_t prov_idx, std::shared_ptr pmi, - std::list>& ep_names) { + ep_names_t& ep_names) { int ret; size_t ep_idx; @@ -901,7 +897,7 @@ atl_status_t atl_ofi_prov_init(atl_ofi_ctx_t& ctx, atl_ofi_prov_t* prov, atl_attr_t* attr, std::shared_ptr pmi, - std::list>& ep_names) { + ep_names_t& ep_names) { struct fi_av_attr av_attr; size_t ep_idx = 0; ssize_t ret = 0; @@ -1201,7 +1197,7 @@ atl_status_t atl_ofi_open_nw_provs(atl_ofi_ctx_t& ctx, struct fi_info* base_hints, atl_attr_t* attr, std::shared_ptr pmi, - std::list>& ep_names, + std::vector& ep_names, bool log_on_error) { atl_status_t ret = ATL_STATUS_SUCCESS; struct fi_info* prov_list = nullptr; @@ -1331,13 +1327,17 @@ atl_status_t atl_ofi_open_nw_provs(atl_ofi_ctx_t& ctx, /* 6. 
create network providers */ LOG_INFO("found ", final_provs.size(), " nic(s) according to all filters"); ctx.nw_prov_count = final_provs.size(); + if (ep_names.size() < ctx.nw_prov_count + ctx.nw_prov_first_idx) { + ep_names.resize(ctx.nw_prov_count + ctx.nw_prov_first_idx); + } for (idx = 0; idx < ctx.nw_prov_count; idx++) { prov_idx = ctx.nw_prov_first_idx + idx; prov = &ctx.provs[prov_idx]; prov->idx = prov_idx; prov->is_shm = 0; - ATL_CALL(atl_ofi_prov_init(ctx, coord, final_provs[idx], prov, attr, pmi, ep_names), - goto err); + ATL_CALL( + atl_ofi_prov_init(ctx, coord, final_provs[idx], prov, attr, pmi, ep_names[prov->idx]), + goto err); } exit: diff --git a/src/atl/ofi/atl_ofi_helper.hpp b/src/atl/ofi/atl_ofi_helper.hpp index c79b30d06..607b1412c 100644 --- a/src/atl/ofi/atl_ofi_helper.hpp +++ b/src/atl/ofi/atl_ofi_helper.hpp @@ -18,10 +18,6 @@ #include #include #include -#include -#include -#include -#include #include #include #include @@ -35,6 +31,7 @@ #include #include "atl/util/pm/pm_rt.h" +#include "common/api_wrapper/ofi_api_wrapper.hpp" #include "common/global/global.hpp" #include "common/utils/utils.hpp" #include "hwloc/hwloc_wrapper.hpp" @@ -241,6 +238,8 @@ typedef struct atl_ofi_global_data { } } atl_ofi_global_data_t; +using ep_names_t = std::vector>; + extern atl_ofi_global_data_t global_data; std::string atl_ofi_get_short_nic_name(const struct fi_info* prov); @@ -250,19 +249,18 @@ atl_ofi_prov_t* atl_ofi_get_prov(atl_ofi_ctx_t& ctx, const atl_ep_t& ep, int peer_proc_idx, size_t msg_size); -fi_addr_t atl_ofi_get_addr(atl_ofi_ctx_t& ctx, atl_ofi_prov_t* prov, int proc_idx, size_t ep_idx); atl_status_t atl_ofi_get_local_proc_coord(atl_proc_coord_t& coord, std::shared_ptr pmi); atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t& ctx, const atl_proc_coord_t& coord, size_t prov_idx, std::shared_ptr pmi, - std::list>& ep_names); + ep_names_t& ep_names); atl_status_t atl_ofi_prov_ep_get_name(atl_ofi_prov_t* prov, size_t ep_idx); atl_status_t 
atl_ofi_prov_eps_connect(atl_ofi_ctx_t& ctx, const atl_proc_coord_t& coord, size_t prov_idx, std::shared_ptr pmi, - std::list>& ep_names); + ep_names_t& ep_names); void atl_ofi_prov_ep_destroy(atl_ofi_prov_t* prov, atl_ofi_prov_ep_t* ep); void atl_ofi_prov_destroy(atl_ofi_ctx_t& ctx, atl_ofi_prov_t* prov); int atl_ofi_wait_cancel_cq(struct fid_cq* cq); @@ -282,7 +280,7 @@ atl_status_t atl_ofi_prov_init(atl_ofi_ctx_t& ctx, atl_ofi_prov_t* prov, atl_attr_t* attr, std::shared_ptr pmi, - std::list>& ep_names); + ep_names_t& ep_names); atl_status_t atl_ofi_adjust_out_tag(atl_ofi_prov_t* prov, atl_attr_t* attr); atl_status_t atl_ofi_parse_mnic_name(atl_ofi_ctx_t& ctx, std::string str_to_parse); int atl_ofi_is_allowed_nic_name(atl_ofi_ctx_t& ctx, struct fi_info* info); @@ -291,6 +289,6 @@ atl_status_t atl_ofi_open_nw_provs(atl_ofi_ctx_t& ctx, struct fi_info* base_hints, atl_attr_t* attr, std::shared_ptr pmi, - std::list>& ep_names, + std::vector& ep_names, bool log_on_error); void atl_ofi_init_req(atl_req_t& req, atl_ofi_prov_ep_t* prov_ep, struct fid_ep* fi_ep); diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp index b642f3b5d..d4d1dcd6a 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp @@ -88,8 +88,8 @@ atl_status_t pmi_resizable::pmrt_init() { return ATL_STATUS_FAILURE; } -atl_status_t pmi_resizable::pmrt_main_addr_reserve(char *main_addr) { - if (PMIR_Main_Addr_Reserve(main_addr) != KVS_STATUS_SUCCESS) +atl_status_t pmi_resizable::pmrt_main_addr_reserve(char *addr) { + if (PMIR_Main_Addr_Reserve(addr) != KVS_STATUS_SUCCESS) return ATL_STATUS_FAILURE; return ATL_STATUS_SUCCESS; diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h index 6eaedca16..64ab9987a 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h @@ 
-56,7 +56,7 @@ class pmi_resizable final : public ipmi { int is_pm_resize_enabled() override; - atl_status_t pmrt_main_addr_reserve(char* main_addr) override; + atl_status_t pmrt_main_addr_reserve(char* addr) override; atl_status_t pmrt_set_resize_function(atl_resize_fn_t resize_fn) override; @@ -102,15 +102,15 @@ class pmi_resizable final : public ipmi { private: bool is_finalized{ false }; /*Was in API ->*/ - kvs_status_t PMIR_Main_Addr_Reserve(char* main_addr); + kvs_status_t PMIR_Main_Addr_Reserve(char* addr); - kvs_status_t PMIR_Init(const char* main_addr); + kvs_status_t PMIR_Init(const char* addr); kvs_status_t PMIR_Finalize(void); - kvs_status_t PMIR_Get_size(int* size); + kvs_status_t PMIR_Get_size(int* size_ptr); - kvs_status_t PMIR_Get_rank(int* rank); + kvs_status_t PMIR_Get_rank(int* rank_ptr); kvs_status_t PMIR_KVS_Get_my_name(char* kvs_name, size_t length); diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h index da97bad25..36aab26d0 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h @@ -157,7 +157,7 @@ typedef enum { KVS_STATUS_SUCCESS, KVS_STATUS_FAILURE, KVS_STATUS_UNSUPPORTED } #define INITIAL_RANK_NUM "0" #define MAX_CLEAN_CHECKS 3 -extern char my_hostname[MAX_KVS_VAL_LENGTH]; +extern char pmi_hostname[MAX_KVS_VAL_LENGTH]; void inline kvs_str_copy(char* dst, const char* src, size_t bytes) { strncpy(dst, src, bytes - 1); diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp index 90f6df7f4..04b069db6 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp @@ -158,9 +158,9 @@ kvs_status_t helper::wait_accept(void) { my_rank = 0; while (1) { - KVS_CHECK_STATUS(get_value_by_name_key(KVS_ACCEPT, my_hostname, my_rank_str), + 
KVS_CHECK_STATUS(get_value_by_name_key(KVS_ACCEPT, pmi_hostname, my_rank_str), "failed to get value"); - if (my_rank_str.length() == 0) + if (my_rank_str.empty()) continue; KVS_CHECK_STATUS(safe_strtol(my_rank_str.c_str(), my_rank), "failed to convert my_rank"); break; @@ -233,9 +233,8 @@ kvs_status_t helper::update_kvs_info(int new_rank) { char kvs_key[MAX_KVS_KEY_LENGTH]; char kvs_val[MAX_KVS_VAL_LENGTH]; size_t kvs_list_size = get_kvs_list_size(ST_CLIENT); - size_t k; - for (k = 0; k < kvs_list_size; k++) { + for (size_t kvs_idx = 0; kvs_idx < kvs_list_size; kvs_idx++) { cut_head(kvs_name, kvs_key, kvs_val, ST_CLIENT); KVS_CHECK_STATUS(remove_name_key(kvs_name, kvs_key), "failed to remove name and key"); @@ -257,9 +256,9 @@ kvs_status_t helper::move_to_new_rank(int new_rank) { SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, new_rank); - // request_set_val(KVS_POD_REQUEST, my_hostname, rank_str); + // request_set_val(KVS_POD_REQUEST, pmi_hostname, rank_str); - KVS_CHECK_STATUS(set_value(KVS_POD_NUM, rank_str, my_hostname), "failed to update kvs info"); + KVS_CHECK_STATUS(set_value(KVS_POD_NUM, rank_str, pmi_hostname), "failed to update kvs info"); return KVS_STATUS_SUCCESS; } @@ -317,19 +316,19 @@ kvs_status_t helper::post_my_info(void) { SET_STR(my_rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank); - KVS_CHECK_STATUS(set_value(KVS_POD_NUM, my_rank_str, my_hostname), "failed to set rank"); + KVS_CHECK_STATUS(set_value(KVS_POD_NUM, my_rank_str, pmi_hostname), "failed to set rank"); KVS_CHECK_STATUS(get_barrier_idx(barrier_num), "failed to get barrier idx"); SET_STR(barrier_num_str, INT_STR_SIZE, SIZE_T_TEMPLATE, barrier_num); - KVS_CHECK_STATUS(set_value(KVS_BARRIER, my_hostname, barrier_num_str), + KVS_CHECK_STATUS(set_value(KVS_BARRIER, pmi_hostname, barrier_num_str), "failed to set barrier idx"); - KVS_CHECK_STATUS(remove_name_key(KVS_ACCEPT, my_hostname), + KVS_CHECK_STATUS(remove_name_key(KVS_ACCEPT, pmi_hostname), "failed to remove accepted hostname"); - 
KVS_CHECK_STATUS(remove_name_key(KVS_APPROVED_NEW_POD, my_hostname), + KVS_CHECK_STATUS(remove_name_key(KVS_APPROVED_NEW_POD, pmi_hostname), "failed to remove approved hostname"); barrier_num++; @@ -370,7 +369,7 @@ kvs_status_t helper::get_val_count(const char* name, const char* val, size_t& re if (count_values != 0) { for (i = 0; i < count_values; i++) { - if (!strcmp(val, kvs_values[i].c_str())) { + if (val == kvs_values[i]) { res++; } } @@ -388,7 +387,7 @@ kvs_status_t helper::get_occupied_ranks_count(char* rank, size_t& res) { KVS_CHECK_STATUS(get_value_by_name_key(KVS_POD_NUM, rank, occupied_rank_val_str), "failed to get occupied rank"); - is_occupied_rank = (occupied_rank_val_str.length() == 0) ? 0 : 1; + is_occupied_rank = (occupied_rank_val_str.empty()) ? 0 : 1; KVS_CHECK_STATUS(get_val_count(KVS_NEW_POD, rank, count_new_pod), "failed to get mew rank"); @@ -413,7 +412,7 @@ kvs_status_t helper::occupied_rank(char* rank) { KVS_CHECK_STATUS(get_value_by_name_key(KVS_UP, KVS_IDX, idx_val), "failed to get ID"); - if ((idx_val.length() == 0) && (my_rank == 0)) { + if ((idx_val.empty()) && (my_rank == 0)) { KVS_CHECK_STATUS(set_value(KVS_UP, KVS_IDX, INITIAL_UPDATE_IDX), "failed to set initial ID"); @@ -424,7 +423,7 @@ kvs_status_t helper::occupied_rank(char* rank) { KVS_CHECK_STATUS(update(clear_shift_list, clear_list, 0), "failed to initial update"); } else { - KVS_CHECK_STATUS(set_value(KVS_NEW_POD, my_hostname, rank), "failed to set rank"); + KVS_CHECK_STATUS(set_value(KVS_NEW_POD, pmi_hostname, rank), "failed to set rank"); } return KVS_STATUS_SUCCESS; } @@ -439,7 +438,7 @@ kvs_status_t helper::reg_rank(void) { size_t i; my_rank = 0; - KVS_CHECK_STATUS(set_value(KVS_POD_REQUEST, my_hostname, INITIAL_RANK_NUM), + KVS_CHECK_STATUS(set_value(KVS_POD_REQUEST, pmi_hostname, INITIAL_RANK_NUM), "failed to set initial rank"); SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank); @@ -455,7 +454,7 @@ kvs_status_t helper::reg_rank(void) { for (i = 0; i < 
count_values; i++) { if (!strcmp(kvs_values[i].c_str(), rank_str)) { my_num_in_pod_request_line++; - if (!strcmp(kvs_keys[i].c_str(), my_hostname)) + if (!strcmp(kvs_keys[i].c_str(), pmi_hostname)) break; } } @@ -481,18 +480,18 @@ kvs_status_t helper::reg_rank(void) { if (!wait_shift) { my_rank++; SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank); - KVS_CHECK_STATUS(set_value(KVS_POD_REQUEST, my_hostname, rank_str), + KVS_CHECK_STATUS(set_value(KVS_POD_REQUEST, pmi_hostname, rank_str), "failed to set rank"); } } - KVS_CHECK_STATUS(remove_name_key(KVS_POD_REQUEST, my_hostname), "failed to remove host info"); + KVS_CHECK_STATUS(remove_name_key(KVS_POD_REQUEST, pmi_hostname), "failed to remove host info"); return KVS_STATUS_SUCCESS; } kvs_status_t helper::get_replica_size(size_t& replica_size) { - return k->kvs_get_replica_size(replica_size); + return kvs->kvs_get_replica_size(replica_size); } kvs_status_t helper::up_kvs(const char* new_kvs_name, const char* old_kvs_name) { @@ -541,30 +540,30 @@ kvs_status_t helper::get_keys_values_by_name(const std::string& kvs_name, std::vector& kvs_keys, std::vector& kvs_values, size_t& count) { - return k->kvs_get_keys_values_by_name(kvs_name, kvs_keys, kvs_values, count); + return kvs->kvs_get_keys_values_by_name(kvs_name, kvs_keys, kvs_values, count); } kvs_status_t helper::set_value(const std::string& kvs_name, const std::string& kvs_key, const std::string& kvs_val) { - return k->kvs_set_value(kvs_name, kvs_key, kvs_val); + return kvs->kvs_set_value(kvs_name, kvs_key, kvs_val); } kvs_status_t helper::remove_name_key(const std::string& kvs_name, const std::string& kvs_key) { - return k->kvs_remove_name_key(kvs_name, kvs_key); + return kvs->kvs_remove_name_key(kvs_name, kvs_key); } kvs_status_t helper::get_value_by_name_key(const std::string& kvs_name, const std::string& kvs_key, std::string& kvs_val) { - return k->kvs_get_value_by_name_key(kvs_name, kvs_key, kvs_val); + return kvs->kvs_get_value_by_name_key(kvs_name, 
kvs_key, kvs_val); } size_t helper::init(const char* main_addr) { - return k->kvs_init(main_addr); + return kvs->kvs_init(main_addr); } kvs_status_t helper::main_server_address_reserve(char* main_addr) { - return k->kvs_main_server_address_reserve(main_addr); + return kvs->kvs_main_server_address_reserve(main_addr); } kvs_status_t helper::get_count_names(const std::string& kvs_name, size_t& count_names) { - return k->kvs_get_count_names(kvs_name, count_names); + return kvs->kvs_get_count_names(kvs_name, count_names); } kvs_status_t helper::finalize(void) { - return k->kvs_finalize(); + return kvs->kvs_finalize(); } diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp index 62bed84d0..6f02bc038 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp @@ -46,7 +46,7 @@ extern int new_ranks_count; class helper { public: helper() = delete; - explicit helper(std::shared_ptr k) : k(std::move(k)){}; + explicit helper(std::shared_ptr k) : kvs(std::move(k)){}; ~helper() = default; kvs_status_t get_update_ranks(void); @@ -125,6 +125,6 @@ class helper { kvs_status_t occupied_rank(char* rank); kvs_status_t up_kvs(const char* new_kvs_name, const char* old_kvs_name); - std::shared_ptr k; + std::shared_ptr kvs; }; #endif diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp index a192e0299..95e2200ce 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp @@ -367,7 +367,7 @@ kvs_status_t internal_kvs::init_main_server_address(const char* main_addr) { } if (ip_getting_type) { - if (strstr(ip_getting_type, CCL_KVS_IP_EXCHANGE_VAL_ENV.c_str())) { + if (ip_getting_type == CCL_KVS_IP_EXCHANGE_VAL_ENV) { ip_getting_mode = IGT_ENV; } else { 
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp index 0a4d543e2..7e287b636 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp @@ -67,7 +67,7 @@ class kvs_request_t { return KVS_STATUS_SUCCESS; } kvs_status_t put(int sock, std::mutex& memory_mutex, size_t put_buf) { - size_t sizeof_put_buf = sizeof(put_buf); + const size_t sizeof_put_buf = sizeof(put_buf); DO_RW_OP(write, sock, &put_buf, sizeof_put_buf, memory_mutex); return KVS_STATUS_SUCCESS; } @@ -101,7 +101,7 @@ class kvs_request_t { return KVS_STATUS_SUCCESS; } kvs_status_t get(int sock, std::mutex& memory_mutex, size_t& get_buf) { - size_t sizeof_get_buf = sizeof(get_buf); + const size_t sizeof_get_buf = sizeof(get_buf); DO_RW_OP(read, sock, &get_buf, sizeof_get_buf, memory_mutex); return KVS_STATUS_SUCCESS; } diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp index 13bcffc2f..e5b79cdc0 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp @@ -43,7 +43,7 @@ void pmi_listener::set_applied_count(int count) { kvs_status_t pmi_listener::collect_sock_addr(std::shared_ptr h) { FILE* fp; size_t i, j; - kvs_status_t res = KVS_STATUS_SUCCESS; + kvs_status_t status = KVS_STATUS_SUCCESS; size_t glob_num_listeners; std::vector sock_addr_str(1); std::vector hosts_names_str(1); @@ -67,20 +67,20 @@ kvs_status_t pmi_listener::collect_sock_addr(std::shared_ptr h) { num_listeners = glob_num_listeners; for (i = 0; i < num_listeners; i++) { - if (strstr(hosts_names_str[i].c_str(), my_hostname)) { + if (strstr(hosts_names_str[i].c_str(), pmi_hostname)) { num_listeners--; break; } } if (num_listeners == 0) { - res = 
KVS_STATUS_SUCCESS; + status = KVS_STATUS_SUCCESS; goto exit; } if ((sock_sender = socket(AF_INET, SOCK_DGRAM, 0)) < 0) { LOG_ERROR("Socket creation error"); - res = KVS_STATUS_FAILURE; + status = KVS_STATUS_FAILURE; goto exit; } @@ -91,7 +91,7 @@ kvs_status_t pmi_listener::collect_sock_addr(std::shared_ptr h) { server_addresses = (struct sockaddr_in*)malloc((num_listeners) * sizeof(struct sockaddr_in)); if (server_addresses == NULL) { LOG_ERROR("nmemory allocation failed"); - res = KVS_STATUS_FAILURE; + status = KVS_STATUS_FAILURE; goto exit; } @@ -100,35 +100,35 @@ kvs_status_t pmi_listener::collect_sock_addr(std::shared_ptr h) { char* point_to_port = strstr(const_cast(sock_addr_str[j].c_str()), "_"); if (point_to_port == NULL) { LOG_ERROR("Wrong address_port record: %s", sock_addr_str[j]); - res = KVS_STATUS_FAILURE; + status = KVS_STATUS_FAILURE; goto exit; } point_to_port[0] = NULL_CHAR; point_to_port++; - if (strstr(const_cast(hosts_names_str[j].c_str()), my_hostname)) { + if (strstr(const_cast(hosts_names_str[j].c_str()), pmi_hostname)) { i--; continue; } if (safe_strtol(point_to_port, server_addresses[i].sin_port) != KVS_STATUS_SUCCESS) { LOG_ERROR("failed to convert sin_port"); - res = KVS_STATUS_FAILURE; + status = KVS_STATUS_FAILURE; goto exit; } server_addresses[i].sin_family = AF_INET; if (inet_pton(AF_INET, sock_addr_str[j].c_str(), &(server_addresses[i].sin_addr)) <= 0) { LOG_ERROR("Invalid address/ Address not supported: %s", sock_addr_str[j].c_str()); - res = KVS_STATUS_FAILURE; + status = KVS_STATUS_FAILURE; goto exit; } } exit: - return res; + return status; } kvs_status_t pmi_listener::clean_listener(std::shared_ptr h) { - KVS_CHECK_STATUS(h->remove_name_key(KVS_LISTENER, my_hostname), "failed to remove host info"); + KVS_CHECK_STATUS(h->remove_name_key(KVS_LISTENER, pmi_hostname), "failed to remove host info"); close(sock_listener); return KVS_STATUS_SUCCESS; } @@ -199,7 +199,7 @@ kvs_status_t pmi_listener::run_listener(std::shared_ptr h) { 
SET_STR( addr_for_kvs, REQUEST_POSTFIX_SIZE, KVS_NAME_TEMPLATE_I, my_ip, (size_t)addr.sin_port); - KVS_CHECK_STATUS(h->set_value(KVS_LISTENER, my_hostname, addr_for_kvs), + KVS_CHECK_STATUS(h->set_value(KVS_LISTENER, pmi_hostname, addr_for_kvs), "failed to set addr info"); if (setsockopt(sock_listener, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout)) < 0) { perror("Error"); diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp index 0190df9fb..00af8c1ee 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp @@ -22,7 +22,7 @@ static size_t ask_only_framework = 0; static size_t finalized = 0; static size_t extreme_finalize = 0; static struct sigaction old_act; -char my_hostname[MAX_KVS_VAL_LENGTH]; +char pmi_hostname[MAX_KVS_VAL_LENGTH]; // TODO: rework it for multi kvs static pmi_resizable* pmi_object; @@ -224,7 +224,7 @@ kvs_status_t pmi_resizable::hard_finalize(int sig) { SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank); - KVS_CHECK_STATUS(h->set_value(KVS_DEAD_POD, my_hostname, rank_str), "failed to set dead rank"); + KVS_CHECK_STATUS(h->set_value(KVS_DEAD_POD, pmi_hostname, rank_str), "failed to set dead rank"); KVS_CHECK_STATUS(listener.send_notification(sig, h), "failed to send notification"); @@ -236,31 +236,31 @@ kvs_status_t pmi_resizable::hard_finalize(int sig) { return KVS_STATUS_SUCCESS; } -kvs_status_t pmi_resizable::PMIR_Main_Addr_Reserve(char* main_addr) { - return h->main_server_address_reserve(main_addr); +kvs_status_t pmi_resizable::PMIR_Main_Addr_Reserve(char* addr) { + return h->main_server_address_reserve(addr); } -kvs_status_t pmi_resizable::PMIR_Init(const char* main_addr) { +kvs_status_t pmi_resizable::PMIR_Init(const char* addr) { struct sigaction act; FILE* fp; finalized = 0; - memset(my_hostname, 0, MAX_KVS_VAL_LENGTH); + memset(pmi_hostname, 0, 
MAX_KVS_VAL_LENGTH); if ((fp = popen("hostname", READ_ONLY)) == NULL) { printf("Can't get hostname\n"); exit(1); } - CHECK_FGETS(fgets(my_hostname, MAX_KVS_VAL_LENGTH, fp), my_hostname); + CHECK_FGETS(fgets(pmi_hostname, MAX_KVS_VAL_LENGTH, fp), pmi_hostname); pclose(fp); - while (my_hostname[strlen(my_hostname) - 1] == '\n' || - my_hostname[strlen(my_hostname) - 1] == ' ') - my_hostname[strlen(my_hostname) - 1] = '\0'; + while (pmi_hostname[strlen(pmi_hostname) - 1] == '\n' || + pmi_hostname[strlen(pmi_hostname) - 1] == ' ') + pmi_hostname[strlen(pmi_hostname) - 1] = '\0'; - SET_STR(&(my_hostname[strlen(my_hostname)]), - MAX_KVS_VAL_LENGTH - (int)strlen(my_hostname) - 1, + SET_STR(&(pmi_hostname[strlen(pmi_hostname)]), + MAX_KVS_VAL_LENGTH - (int)strlen(pmi_hostname) - 1, "-%d", getpid()); - KVS_CHECK_STATUS(h->init(main_addr), "failed to init"); + KVS_CHECK_STATUS(h->init(addr), "failed to init"); KVS_CHECK_STATUS(h->reg_rank(), "failed to rank register"); @@ -304,7 +304,8 @@ kvs_status_t pmi_resizable::PMIR_Finalize(void) { if (my_rank == 0 && extreme_finalize != 1) { KVS_CHECK_STATUS(h->remove_name_key(KVS_UP, KVS_IDX), "failed to remove IDx"); } - KVS_CHECK_STATUS(h->remove_name_key(KVS_BARRIER, my_hostname), "failed to remove barrier info"); + KVS_CHECK_STATUS(h->remove_name_key(KVS_BARRIER, pmi_hostname), + "failed to remove barrier info"); KVS_CHECK_STATUS(h->finalize(), "failed to finalize"); @@ -320,7 +321,7 @@ kvs_status_t pmi_resizable::PMIR_Barrier(void) { SET_STR(barrier_num_str, INT_STR_SIZE, SIZE_T_TEMPLATE, barrier_num); - KVS_CHECK_STATUS(h->set_value(KVS_BARRIER, my_hostname, barrier_num_str), + KVS_CHECK_STATUS(h->set_value(KVS_BARRIER, pmi_hostname, barrier_num_str), "failed to set barrier info"); KVS_CHECK_STATUS(h->get_barrier_idx(min_barrier_num), "failed to get barrier IDx"); @@ -335,13 +336,13 @@ kvs_status_t pmi_resizable::PMIR_Barrier(void) { return KVS_STATUS_SUCCESS; } -kvs_status_t pmi_resizable::PMIR_Get_size(int* size) { - *size = 
count_pods; +kvs_status_t pmi_resizable::PMIR_Get_size(int* size_ptr) { + *size_ptr = count_pods; return KVS_STATUS_SUCCESS; } -kvs_status_t pmi_resizable::PMIR_Get_rank(int* rank) { - *rank = my_rank; +kvs_status_t pmi_resizable::PMIR_Get_rank(int* rank_ptr) { + *rank_ptr = my_rank; return KVS_STATUS_SUCCESS; } @@ -385,7 +386,7 @@ kvs_status_t pmi_resizable::PMIR_KVS_Get(const char* kvs_name, std::string value_vec; do { KVS_CHECK_STATUS(h->get_value_by_name_key(kvs_name, key, value_vec), "failed to get value"); - } while (value_vec.length() == 0); + } while (value_vec.empty()); snprintf(value, value_vec.length(), "%s", value_vec.c_str()); return KVS_STATUS_SUCCESS; diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp index 58c961dbc..b12cbdb7a 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp @@ -87,7 +87,7 @@ atl_status_t pmi_resizable_simple::make_requested_info() { return ATL_STATUS_SUCCESS; } -atl_status_t pmi_resizable_simple::pmrt_main_addr_reserve(char* main_addr) { +atl_status_t pmi_resizable_simple::pmrt_main_addr_reserve(char* addr) { return ATL_STATUS_UNSUPPORTED; } @@ -271,7 +271,7 @@ atl_status_t pmi_resizable_simple::kvs_get_value(const char* kvs_name, KVS_2_ATL_CHECK_STATUS(k->kvs_get_value_by_name_key(result_kvs_name, key, value_vec), "failed to get value"); kvs_get_time = time(NULL) - start_time; - } while (value_vec.length() == 0 && kvs_get_time < kvs_get_timeout); + } while (value_vec.empty() && kvs_get_time < kvs_get_timeout); if (kvs_get_time >= kvs_get_timeout) { LOG_ERROR("KVS get error: timeout limit (%zu > %zu), prefix: %s, key %s\n", @@ -454,7 +454,7 @@ atl_status_t pmi_resizable_simple::get_local_kvs_id(size_t& res) { atl_status_t pmi_resizable_simple::set_local_kvs_id(size_t local_kvs_id) { /*TODO: change it for collect local_per_rank id, not global*/ put_key(LOCAL_KVS_ID, 
"ID", std::to_string(local_kvs_id).c_str(), ST_CLIENT); - return (k->kvs_set_value(LOCAL_KVS_ID, "ID", std::to_string(local_kvs_id).c_str()) == + return (k->kvs_set_value(LOCAL_KVS_ID, "ID", std::to_string(local_kvs_id)) == KVS_STATUS_SUCCESS) ? ATL_STATUS_SUCCESS : ATL_STATUS_FAILURE; @@ -467,8 +467,7 @@ pmi_resizable_simple::~pmi_resizable_simple() { atl_status_t pmi_resizable_simple::remove_initial_data() { std::string result_kvs_name = std::string(RANKS_PER_THREAD) + std::to_string(0); remove_val(result_kvs_name.c_str(), std::to_string(ranks[0]).c_str(), ST_CLIENT); - return k->kvs_remove_name_key(result_kvs_name.c_str(), std::to_string(ranks[0]).c_str()) == - KVS_STATUS_SUCCESS + return k->kvs_remove_name_key(result_kvs_name, std::to_string(ranks[0])) == KVS_STATUS_SUCCESS ? ATL_STATUS_SUCCESS : ATL_STATUS_FAILURE; } diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp index 46149fad4..4138297c4 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp @@ -148,7 +148,7 @@ atl_status_t pmi_resizable_simple_internal::barrier_reg() { return ATL_STATUS_SUCCESS; } -atl_status_t pmi_resizable_simple_internal::pmrt_main_addr_reserve(char* main_addr) { +atl_status_t pmi_resizable_simple_internal::pmrt_main_addr_reserve(char* addr) { LOG_ERROR("unsupported"); return ATL_STATUS_UNSUPPORTED; } @@ -239,7 +239,7 @@ atl_status_t pmi_resizable_simple_internal::pmrt_kvs_get(char* kvs_key, size_t kvs_val_len) { int ret; char key_storage[max_keylen]; - std::string val_storage; + std::string val_storage_str; ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx); if (ret < 0) { @@ -247,9 +247,9 @@ atl_status_t pmi_resizable_simple_internal::pmrt_kvs_get(char* kvs_key, return ATL_STATUS_FAILURE; } - ATL_CHECK_STATUS(kvs_get_value(KVS_NAME, key_storage, 
val_storage), "failed to get val"); + ATL_CHECK_STATUS(kvs_get_value(KVS_NAME, key_storage, val_storage_str), "failed to get val"); - ret = decode(val_storage.c_str(), kvs_val, kvs_val_len); + ret = decode(val_storage_str.c_str(), kvs_val, kvs_val_len); if (ret) { LOG_ERROR("decode failed"); return ATL_STATUS_FAILURE; @@ -299,7 +299,7 @@ atl_status_t pmi_resizable_simple_internal::kvs_get_value(const std::string& kvs KVS_2_ATL_CHECK_STATUS(k->kvs_get_value_by_name_key(result_kvs_name, key, value), "failed to get value"); kvs_get_time = time(NULL) - start_time; - } while (value.length() == 0 && kvs_get_time < kvs_get_timeout); + } while (value.empty() && kvs_get_time < kvs_get_timeout); if (kvs_get_time >= kvs_get_timeout) { LOG_ERROR("KVS get error: timeout limit (%zu > %zu), prefix: %s, key %s\n", @@ -324,8 +324,7 @@ atl_status_t pmi_resizable_simple_internal::get_local_kvs_id(size_t& res) { atl_status_t pmi_resizable_simple_internal::set_local_kvs_id(size_t local_kvs_id) { /*TODO: change it for collect local_per_rank id, not global*/ - return k->kvs_set_value(LOCAL_KVS_ID, "ID", std::to_string(local_kvs_id).c_str()) == - KVS_STATUS_SUCCESS + return k->kvs_set_value(LOCAL_KVS_ID, "ID", std::to_string(local_kvs_id)) == KVS_STATUS_SUCCESS ? 
ATL_STATUS_SUCCESS : ATL_STATUS_FAILURE; } diff --git a/src/ccl_api_functions.cpp b/src/ccl_api_functions.cpp index ef59bc0d6..d4b8c3492 100644 --- a/src/ccl_api_functions.cpp +++ b/src/ccl_api_functions.cpp @@ -952,18 +952,18 @@ API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(double); #define COMMA , #endif -API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer); -API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer); -API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer); -API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer); -API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer); -API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer); -API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer); -API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer); -/*API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer);*/ -API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer); -API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer); -/*API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer);*/ +API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer); +API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer); +API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer); +API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer); +API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer); +API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer); +API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer); +API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer); +/*API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer);*/ +API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer); +API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer); +/*API_COMM_OP_REF_EXPLICIT_INSTANTIATION(sycl::buffer);*/ #undef COMMA #endif // CCL_ENABLE_SYCL diff --git a/src/coll/algorithms/algorithms.hpp b/src/coll/algorithms/algorithms.hpp index 5ff41e2fe..38f56a942 100644 --- a/src/coll/algorithms/algorithms.hpp +++ b/src/coll/algorithms/algorithms.hpp @@ -23,51 +23,49 @@ #define CCL_UNDEFINED_ALGO_ID (-1) -ccl::status 
ccl_coll_build_naive_bcast(ccl_sched* sched, - ccl_buffer buf, - size_t count, - const ccl_datatype& dtype, - int root, - ccl_comm* comm); - -ccl::status ccl_coll_build_scatter_ring_allgather_bcast(ccl_sched* sched, - ccl_buffer buf, - size_t count, - const ccl_datatype& dtype, - int root, - ccl_comm* comm); - -#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) -ccl::status ccl_coll_build_topo_bcast(ccl_sched* sched, - ccl_buffer buf, - size_t count, - const ccl_datatype& dtype, - int root, - ccl_comm* comm); -#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE - -ccl::status ccl_coll_build_dissemination_barrier(ccl_sched* sched, ccl_comm* comm); - -ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& dtype, - ccl::reduction reduction, - int root, - ccl_comm* comm); - +// allgatherv +ccl::status ccl_coll_build_direct_allgatherv(ccl_sched* sched, + ccl_buffer send_buf, + size_t send_count, + ccl_buffer recv_buf, + const size_t* recv_counts, + const ccl_datatype& dtype, + ccl_comm* comm); +ccl::status ccl_coll_build_naive_allgatherv(ccl_sched* sched, + ccl_buffer send_buf, + size_t send_count, + ccl_buffer recv_buf, + const size_t* recv_counts, + const ccl_datatype& dtype, + ccl_comm* comm); +ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* sched, + ccl_buffer send_buf, + size_t send_count, + ccl_buffer recv_buf, + const size_t* recv_counts, + const ccl_datatype& dtype, + ccl_comm* comm); +ccl::status ccl_coll_build_flat_allgatherv(ccl_sched* main_sched, + std::vector& scheds, + const ccl_coll_param& coll_param); +ccl::status ccl_coll_build_multi_bcast_allgatherv(ccl_sched* main_sched, + std::vector& scheds, + const ccl_coll_param& coll_param, + size_t data_partition_count); #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) -ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& 
dtype, - ccl::reduction reduction, - int root, - ccl_comm* comm); +ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, + std::vector& scheds, + const ccl_coll_param& coll_param); #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE +// allreduce +ccl::status ccl_coll_build_direct_allreduce(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t count, + const ccl_datatype& dtype, + ccl::reduction reduction, + ccl_comm* comm); ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, @@ -75,16 +73,13 @@ ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched, const ccl_datatype& dtype, ccl::reduction reduction, ccl_comm* comm); - -ccl::status ccl_coll_build_binomial_reduce(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& dtype, - ccl::reduction reduction, - int root, - ccl_comm* comm); - +ccl::status ccl_coll_build_nreduce_allreduce(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t count, + const ccl_datatype& dtype, + ccl::reduction reduction, + ccl_comm* comm); ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, @@ -92,7 +87,6 @@ ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched, const ccl_datatype& dtype, ccl::reduction reduction, ccl_comm* comm); - ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, @@ -100,7 +94,6 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched, const ccl_datatype& dtype, ccl::reduction reduction, ccl_comm* comm); - ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, @@ -108,15 +101,13 @@ ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched, const ccl_datatype& dtype, ccl::reduction reduction, ccl_comm* comm); - -ccl::status ccl_coll_build_nreduce_allreduce(ccl_sched* sched, - 
ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& dtype, - ccl::reduction reduction, - ccl_comm* comm); - +ccl::status ccl_coll_build_2d_allreduce(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t count, + const ccl_datatype& dtype, + ccl::reduction reduction, + ccl_comm* comm); #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) ccl::status ccl_coll_build_topo_allreduce(ccl_sched* sched, ccl_buffer send_buf, @@ -125,83 +116,68 @@ ccl::status ccl_coll_build_topo_allreduce(ccl_sched* sched, const ccl_datatype& dtype, ccl::reduction reduction, ccl_comm* comm); - #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE -ccl::status ccl_coll_build_naive_allgatherv(ccl_sched* sched, - ccl_buffer send_buf, - size_t send_count, - ccl_buffer recv_buf, - const size_t* recv_counts, - const ccl_datatype& dtype, - ccl_comm* comm); - -class ccl_double_tree; -ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched, - ccl_coll_type coll_type, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& dtype, - ccl::reduction reduction, - const ccl_double_tree& dtree, - ccl_comm* comm); - -ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t send_count, - const ccl_datatype& dtype, - ccl::reduction reduction, - ccl_comm* comm); - -ccl::status ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t recv_count, - const ccl_datatype& dtype, - ccl::reduction reduction, - ccl_comm* comm); - -ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* sched, +// alltoall(v) +ccl::status ccl_coll_build_direct_alltoall(ccl_sched* sched, ccl_buffer send_buf, - size_t send_count, ccl_buffer recv_buf, - const size_t* recv_counts, + size_t count, const ccl_datatype& dtype, ccl_comm* comm); -ccl::status ccl_coll_build_flat_allgatherv(ccl_sched* main_sched, - std::vector& scheds, - const ccl_coll_param& 
coll_param); - -ccl::status ccl_coll_build_multi_bcast_allgatherv(ccl_sched* main_sched, - std::vector& scheds, - const ccl_coll_param& coll_param, - size_t data_partition_count); - -ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, - std::vector& scheds, - const ccl_coll_param& coll_param); - +ccl::status ccl_coll_build_direct_alltoallv(ccl_sched* sched, + ccl_buffer send_buf, + const size_t* send_counts, + ccl_buffer recv_buf, + const size_t* recv_counts, + const ccl_datatype& dtype, + ccl_comm* comm); ccl::status ccl_coll_build_naive_alltoallv(ccl_sched* main_sched, std::vector& scheds, const ccl_coll_param& coll_param); - ccl::status ccl_coll_build_scatter_alltoallv(ccl_sched* main_sched, std::vector& scheds, const ccl_coll_param& coll_param); - #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, std::vector& scheds, const ccl_coll_param& coll_param); #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE -/* direct algorithms - i.e. 
direct mapping on collective API from transport level */ - +// barrier ccl::status ccl_coll_build_direct_barrier(ccl_sched* sched, ccl_comm* comm); +ccl::status ccl_coll_build_dissemination_barrier(ccl_sched* sched, ccl_comm* comm); +// bcast +ccl::status ccl_coll_build_direct_bcast(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int root, + ccl_comm* comm); +ccl::status ccl_coll_build_scatter_ring_allgather_bcast(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int root, + ccl_comm* comm); +ccl::status ccl_coll_build_naive_bcast(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int root, + ccl_comm* comm); +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +ccl::status ccl_coll_build_topo_bcast(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int root, + ccl_comm* comm); +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + +// reduce ccl::status ccl_coll_build_direct_reduce(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, @@ -210,45 +186,34 @@ ccl::status ccl_coll_build_direct_reduce(ccl_sched* sched, ccl::reduction reduction, int root, ccl_comm* comm); - -ccl::status ccl_coll_build_direct_allgatherv(ccl_sched* sched, - ccl_buffer send_buf, - size_t send_count, - ccl_buffer recv_buf, - const size_t* recv_counts, - const ccl_datatype& dtype, - ccl_comm* comm); - -ccl::status ccl_coll_build_direct_allreduce(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& dtype, - ccl::reduction reduction, - ccl_comm* comm); - -ccl::status ccl_coll_build_direct_alltoall(ccl_sched* sched, +ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t count, + const ccl_datatype& dtype, + ccl::reduction reduction, + int root, + ccl_comm* comm); +ccl::status ccl_coll_build_binomial_reduce(ccl_sched* sched, ccl_buffer send_buf, 
ccl_buffer recv_buf, size_t count, const ccl_datatype& dtype, + ccl::reduction reduction, + int root, ccl_comm* comm); +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t count, + const ccl_datatype& dtype, + ccl::reduction reduction, + int root, + ccl_comm* comm); +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE -ccl::status ccl_coll_build_direct_alltoallv(ccl_sched* sched, - ccl_buffer send_buf, - const size_t* send_counts, - ccl_buffer recv_buf, - const size_t* recv_counts, - const ccl_datatype& dtype, - ccl_comm* comm); - -ccl::status ccl_coll_build_direct_bcast(ccl_sched* sched, - ccl_buffer buf, - size_t count, - const ccl_datatype& dtype, - int root, - ccl_comm* comm); - +// reduce_scatter ccl::status ccl_coll_build_direct_reduce_scatter(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, @@ -256,7 +221,21 @@ ccl::status ccl_coll_build_direct_reduce_scatter(ccl_sched* sched, const ccl_datatype& dtype, ccl::reduction reduction, ccl_comm* comm); - +ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t send_count, + const ccl_datatype& dtype, + ccl::reduction reduction, + ccl_comm* comm); +ccl::status ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t recv_count, + const ccl_datatype& dtype, + ccl::reduction reduction, + ccl_comm* comm); +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, @@ -264,3 +243,15 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, const ccl_datatype& dtype, ccl::reduction reduction, ccl_comm* comm); +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + +class ccl_double_tree; +ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched, + ccl_coll_type coll_type, + 
ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t count, + const ccl_datatype& dtype, + ccl::reduction reduction, + const ccl_double_tree& dtree, + ccl_comm* comm); diff --git a/src/coll/algorithms/allgatherv.cpp b/src/coll/algorithms/allgatherv.cpp index f2569cff0..674ee3e42 100644 --- a/src/coll/algorithms/allgatherv.cpp +++ b/src/coll/algorithms/allgatherv.cpp @@ -162,7 +162,6 @@ ccl::status ccl_coll_get_allgatherv_bufs_and_offsets(const ccl_coll_param& coll_ } else { size_t offset = 0; - size_t dtype_size = coll_param.dtype.size(); for (int idx = 0; idx < comm_size; idx++) { size_t bytes = coll_param.get_recv_count(idx) * dtype_size; recv_bufs[idx].set(coll_param.get_recv_buf(), offset + bytes, offset); @@ -338,6 +337,7 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, ccl_comm* pair_comm = comm->get_pair_comm().get(); ccl_comm* even_comm = comm->get_even_comm().get(); ccl_comm* node_comm = comm->get_node_comm().get(); + ccl_comm* r2r_comm = comm->get_r2r_comm().get(); const int lead_rank = ccl::global_data::env().kernel_1s_lead; const bool is_lead_rank = pair_comm->rank() == lead_rank; @@ -345,7 +345,7 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, const int even_comm_size = even_comm->size(); const bool is_multi_card = (even_comm_size > 1); const ccl::topo_manager& topo_manager = comm->get_topo_manager(); - CCL_THROW_IF_NOT(topo_manager.is_single_card != is_multi_card); + bool is_single_node = topo_manager.is_single_node; /* IPC exchange */ std::vector in_buffers{ @@ -367,14 +367,150 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, // using add_sched_barrier_for_parallel_copies function std::list parallel_copy_events; - // for small msg sizes we get more performance without main blitter using (overhead?) 
- const bool can_use_main_blitter = (send_count * dtype.size()) > 1048576; + auto add_sched_barrier_for_parallel_copies = [&]() { + wait_events.insert( + wait_events.end(), parallel_copy_events.begin(), parallel_copy_events.end()); + parallel_copy_events.clear(); + sched->add_barrier(); + }; + + if (!is_single_node) { + if (!is_inplace) { + // copy data from my send_buf to my recv_buf + copy_attr attr{}; + attr.direction = copy_direction::d2d; + auto entry = entry_factory::create(sched, + send_buf, + recv_bufs[comm->rank()], + recv_counts[comm->rank()], + dtype, + attr, + wait_events); + parallel_copy_events.push_back(entry->entry_event); + } + + // pack data to be used for scaleout + std::vector recv_bufs_r2r; + std::vector recv_counts_r2r; + for (int i = 0; i < r2r_comm->size(); i++) { + const int global_rank = r2r_comm->get_global_rank(i); + recv_bufs_r2r.push_back(recv_bufs[global_rank]); + recv_counts_r2r.push_back(recv_counts[global_rank]); + } + + if (!is_single_node) { + ccl_coll_entry_param coll_param_scaleout{}; + coll_param_scaleout.ctype = ccl_coll_allgatherv; + coll_param_scaleout.send_buf = send_buf; + coll_param_scaleout.recv_bufs = recv_bufs_r2r; + coll_param_scaleout.send_count = send_count; + coll_param_scaleout.recv_counts = recv_counts_r2r.data(); + coll_param_scaleout.dtype = dtype; + coll_param_scaleout.comm = r2r_comm; + + ccl::add_scaleout(sched, coll_param_scaleout, is_single_node, wait_events); + } + + ccl::add_comm_barrier(sched, even_comm, wait_events); + auto recv_send_peers = [&](ccl_comm* recv_comm, + ccl_comm* send_comm, + size_t scaleout_offset = 0, + bool is_inplace = false) { + for (int peer_idx = 1; peer_idx < recv_comm->size(); peer_idx++) { + // copy data from all peers in even_comm + const int peer_rank = (recv_comm->rank() + peer_idx) % recv_comm->size(); + CCL_THROW_IF_NOT(recv_comm->rank() != peer_rank, "Do not copy from own rank"); + const int global_rank = + (recv_comm->get_global_rank(peer_rank) + scaleout_offset) % 
comm->size(); + copy_attr attr{}; + attr.peer_rank = peer_rank; + if (is_inplace) + attr.peer_buf_idx = recv_buf_idx_start + global_rank; + else + attr.peer_buf_idx = send_buf_idx; + attr.direction = copy_direction::c2c; + attr.map_comm = recv_comm; + attr.hint_queue_index = parallel_copy_events.size(); + auto entry = entry_factory::create(sched, + ccl_buffer(), + recv_bufs[global_rank], + recv_counts[global_rank], + dtype, + attr, + wait_events); + parallel_copy_events.push_back(entry->entry_event); + + // do not do mdfi copy if only one tile is used + if (send_comm->size() == 1) { + continue; + } + + // copy the data recieved from even_comm peer (xelink) to pair_comm peer (mdfi) + int send_rank = (send_comm->rank() + 1) % send_comm->size(); + copy_attr attr_send{}; + attr_send.peer_rank = send_rank; + attr_send.peer_buf_idx = recv_buf_idx_start + global_rank; + attr_send.direction = copy_direction::t2t; + attr_send.map_comm = send_comm; + auto entry_send = + entry_factory::create(sched, + recv_bufs[global_rank], + ccl_buffer(), + recv_counts[global_rank], + dtype, + attr_send, + wait_events, + std::vector{ entry->entry_event }); + parallel_copy_events.push_back(entry_send->entry_event); + } + }; + + size_t node_offset = 0; + // in case of scaleout, data is already copied to recv_buf and we can use in_place + bool is_use_inplace = is_inplace || !is_single_node; + for (int r2r_rank = 0; r2r_rank < r2r_comm->size(); r2r_rank++) { + // copy data from even_comm peers (xelink) that they recieved during scaleout + // and write the copied data to pair_comm peer (mdfi) + recv_send_peers(even_comm, pair_comm, node_offset, is_use_inplace); + node_offset += node_comm->size(); + + // do not do mdfi copy if only one tile is used + if (pair_comm->size() == 1) { + continue; + } + + // write the data recieved during scaleout to pair_comm peer (mdfi) + int send_rank = (pair_comm->rank() + 1) % pair_comm->size(); + copy_attr attr_send{}; + attr_send.peer_rank = send_rank; + const 
int global_rank = r2r_comm->get_global_rank(r2r_rank); + attr_send.peer_buf_idx = recv_buf_idx_start + global_rank; + attr_send.direction = copy_direction::t2t; + attr_send.map_comm = pair_comm; + auto entry_send = entry_factory::create(sched, + recv_bufs[global_rank], + ccl_buffer(), + recv_counts[global_rank], + dtype, + attr_send, + wait_events); + + parallel_copy_events.push_back(entry_send->entry_event); + } + add_sched_barrier_for_parallel_copies(); + ccl::add_comm_barrier(sched, pair_comm, wait_events); + + return ccl::status::success; + } + + // for small msg sizes we get more performance without main CE using (main CE has overhead) + const bool can_use_small_msg_optimization = (send_count * dtype.size()) <= (1 * 1024 * 1024); // we use small scale algorithm by default and enable large scale algorithm using knob, // because small scale algorithm show more perfomance for today // TODO: and also we need to replace this knob with more intelligent switching in the future const bool can_use_large_scale_algorithm = ccl::global_data::env().allgatherv_topo_large_scale; - //ccl::global_data::env().ze_copy_engine != ccl_ze_copy_engine_none && is_multi_card && + //comm->get_env()->get_ze_copy_engine() != ccl::ze::copy_engine_mode::none && is_multi_card && //ccl::global_data::env().ze_max_copy_queues /* here must be real queue count */ >= even_comm_size-1) or unspecified; auto send_to_peers = [&](ccl_comm* comm, ccl_buffer in_buf, size_t count, size_t peer_buf_idx) { @@ -384,10 +520,11 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, copy_attr attr{}; attr.peer_rank = peer_rank; attr.peer_buf_idx = peer_buf_idx; - attr.direction = copy_direction::d2d; + // using of link CE for small msgs give us more performance + const bool use_c2c_direction = (comm == even_comm) || can_use_small_msg_optimization; + attr.direction = (use_c2c_direction) ? 
copy_direction::c2c : copy_direction::d2d; attr.map_comm = comm; attr.hint_queue_index = parallel_copy_events.size(); - attr.is_peer_card_copy = (comm == even_comm) ? true : !can_use_main_blitter; auto entry = entry_factory::create( sched, in_buf, ccl_buffer(), count, dtype, attr, wait_events); parallel_copy_events.push_back(entry->entry_event); @@ -402,10 +539,10 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, copy_attr attr{}; attr.peer_rank = peer_rank; attr.peer_buf_idx = send_buf_idx; - attr.direction = copy_direction::d2d; + const bool use_c2c_direction = (comm == even_comm) || can_use_small_msg_optimization; + attr.direction = (use_c2c_direction) ? copy_direction::c2c : copy_direction::d2d; attr.map_comm = comm; attr.hint_queue_index = parallel_copy_events.size(); - attr.is_peer_card_copy = (comm == even_comm) ? true : !can_use_main_blitter; auto entry = entry_factory::create(sched, ccl_buffer(), recv_bufs[global_rank], @@ -417,19 +554,12 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, } }; - auto add_sched_barrier_for_parallel_copies = [&]() { - wait_events.insert( - wait_events.end(), parallel_copy_events.begin(), parallel_copy_events.end()); - parallel_copy_events.clear(); - sched->add_barrier(); - }; - const bool do_self_copy = !is_inplace; if (do_self_copy) { /* copy data from my send_buf to my recv_buf */ copy_attr attr{}; attr.hint_queue_index = parallel_copy_events.size(); - attr.is_peer_card_copy = true; + attr.direction = copy_direction::t2t; auto entry = entry_factory::create(sched, send_buf, recv_bufs[comm->rank()], @@ -442,11 +572,11 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, const bool is_small_scale_algorithm = is_multi_card && !can_use_large_scale_algorithm; if (is_small_scale_algorithm) { - LOG_DEBUG("Use small scale algorithm"); + LOG_DEBUG("use small scale algorithm"); } const bool is_large_scale_algorithm = is_multi_card && can_use_large_scale_algorithm; if 
(is_large_scale_algorithm) { - LOG_DEBUG("Use large scale algorithm"); + LOG_DEBUG("use large scale algorithm"); } if (is_small_scale_algorithm) { @@ -458,8 +588,7 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, /* Small scale algorithm: step 2 & 3. intra-card copy */ LOG_DEBUG("topo/scale_up/intra: copy from self to peers"); if (!is_lead_rank && !ccl::global_data::env().enable_ze_bidir_algo) { - auto barrier_event = ccl::add_comm_barrier(sched, pair_comm, wait_events); - wait_events.push_back(barrier_event); + ccl::add_comm_barrier(sched, pair_comm, wait_events); } for (int rank = pair_comm->rank(); rank < comm->size(); rank += pair_comm->size()) { @@ -468,8 +597,7 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, add_sched_barrier_for_parallel_copies(); if (is_lead_rank && !ccl::global_data::env().enable_ze_bidir_algo) { - auto barrier_event = ccl::add_comm_barrier(sched, pair_comm, wait_events); - wait_events.push_back(barrier_event); + ccl::add_comm_barrier(sched, pair_comm, wait_events); } } else { @@ -477,14 +605,12 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, /* Large scale algorithm: step 1 & 2. intra-card copy */ LOG_DEBUG("topo/scale_up/intra: copy to self from peers"); if (!is_lead_rank && !ccl::global_data::env().enable_ze_bidir_algo) { - auto barrier_event = ccl::add_comm_barrier(sched, pair_comm, wait_events); - wait_events.push_back(barrier_event); + ccl::add_comm_barrier(sched, pair_comm, wait_events); } recv_from_peers(pair_comm); add_sched_barrier_for_parallel_copies(); if (is_lead_rank && !ccl::global_data::env().enable_ze_bidir_algo) { - auto barrier_event = ccl::add_comm_barrier(sched, pair_comm, wait_events); - wait_events.push_back(barrier_event); + ccl::add_comm_barrier(sched, pair_comm, wait_events); } } @@ -503,10 +629,7 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, } ccl_comm* barrier_comm = (is_large_scale_algorithm) ? 
even_comm : pair_comm; - auto barrier_event = ccl::add_comm_barrier(sched, barrier_comm, wait_events); - wait_events.push_back(barrier_event); - - // TODO: scaleout here + ccl::add_comm_barrier(sched, barrier_comm, wait_events); return ccl::status::success; } diff --git a/src/coll/algorithms/allreduce/allreduce.cpp b/src/coll/algorithms/allreduce/allreduce.cpp index e3bae7d18..808e26d84 100644 --- a/src/coll/algorithms/allreduce/allreduce.cpp +++ b/src/coll/algorithms/allreduce/allreduce.cpp @@ -269,115 +269,6 @@ ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched, return status; } -ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& dtype, - ccl::reduction op, - ccl_comm* comm) { - LOG_DEBUG("build recursive_doubling allreduce"); - - ccl::status status = ccl::status::success; - - int pof2, rem, comm_size, rank; - int newrank, mask, newdst, dst; - - comm_size = comm->size(); - rank = comm->rank(); - - size_t dtype_size = dtype.size(); - - ccl_buffer tmp_buf = sched->alloc_buffer({ count * dtype_size, send_buf }); - - /* copy local data into recv_buf */ - if (send_buf != recv_buf) { - entry_factory::create(sched, send_buf, recv_buf, count, dtype); - sched->add_barrier(); - } - - if (comm_size == 1) - return status; - - /* get nearest power-of-two less than or equal to comm_size */ - pof2 = comm->pof2(); - rem = comm_size - pof2; - - /* In the non-power-of-two case, all even-numbered - * processes of rank < 2*rem send their data to - * (rank+1). These even-numbered processes no longer - * participate in the algorithm until the very end. The - * remaining processes form a nice power-of-two. 
*/ - - if (rank < 2 * rem) { - if (rank % 2 == 0) { /* even */ - entry_factory::create(sched, recv_buf, count, dtype, rank + 1, comm); - sched->add_barrier(); - - /* temporarily set the rank to -1 so that this - * process does not pariticipate in recursive - * doubling */ - newrank = -1; - } - else { /* odd */ - entry_factory::create(sched, tmp_buf, count, dtype, rank - 1, comm); - sched->add_barrier(); - - /* do the reduction on received data. since the - * ordering is right, it doesn't matter whether - * the operation is commutative or not. */ - - entry_factory::create( - sched, tmp_buf, count, recv_buf, nullptr, dtype, op); - sched->add_barrier(); - - /* change the rank */ - newrank = rank / 2; - } - } - else /* rank >= 2*rem */ - newrank = rank - rem; - - if (newrank != -1) { - mask = 0x1; - while (mask < pof2) { - newdst = newrank ^ mask; - /* find real rank of dest */ - dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; - - /* Send the most current data, which is in recv_buf. Recv - * into tmp_buf */ - entry_factory::create(sched, tmp_buf, count, dtype, dst, comm); - /* sendrecv, no barrier here */ - entry_factory::create(sched, recv_buf, count, dtype, dst, comm); - sched->add_barrier(); - - /* tmp_buf contains data received in this step. - * recv_buf contains data accumulated so far */ - entry_factory::create( - sched, tmp_buf, count, recv_buf, nullptr, dtype, op); - sched->add_barrier(); - - mask <<= 1; - } - } - - /* In the non-power-of-two case, all odd-numbered - * processes of rank < 2*rem send the result to - * (rank-1), the ranks who didn't participate above. 
*/ - if (rank < 2 * rem) { - if (rank % 2) { /* odd */ - entry_factory::create(sched, recv_buf, count, dtype, rank - 1, comm); - } - else { /* even */ - entry_factory::create(sched, recv_buf, count, dtype, rank + 1, comm); - } - sched->add_barrier(); - } - - return status; -} - ccl::status ccl_coll_build_nreduce_allreduce(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, @@ -559,6 +450,290 @@ ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched, return status; } +ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t count, + const ccl_datatype& dtype, + ccl::reduction op, + ccl_comm* comm) { + LOG_DEBUG("build recursive_doubling allreduce"); + + ccl::status status = ccl::status::success; + + int pof2, rem, comm_size, rank; + int newrank, mask, newdst, dst; + + comm_size = comm->size(); + rank = comm->rank(); + + size_t dtype_size = dtype.size(); + + ccl_buffer tmp_buf = sched->alloc_buffer({ count * dtype_size, send_buf }); + + /* copy local data into recv_buf */ + if (send_buf != recv_buf) { + entry_factory::create(sched, send_buf, recv_buf, count, dtype); + sched->add_barrier(); + } + + if (comm_size == 1) + return status; + + /* get nearest power-of-two less than or equal to comm_size */ + pof2 = comm->pof2(); + rem = comm_size - pof2; + + /* In the non-power-of-two case, all even-numbered + * processes of rank < 2*rem send their data to + * (rank+1). These even-numbered processes no longer + * participate in the algorithm until the very end. The + * remaining processes form a nice power-of-two. 
*/ + + if (rank < 2 * rem) { + if (rank % 2 == 0) { /* even */ + entry_factory::create(sched, recv_buf, count, dtype, rank + 1, comm); + sched->add_barrier(); + + /* temporarily set the rank to -1 so that this + * process does not pariticipate in recursive + * doubling */ + newrank = -1; + } + else { /* odd */ + entry_factory::create(sched, tmp_buf, count, dtype, rank - 1, comm); + sched->add_barrier(); + + /* do the reduction on received data. since the + * ordering is right, it doesn't matter whether + * the operation is commutative or not. */ + + entry_factory::create( + sched, tmp_buf, count, recv_buf, nullptr, dtype, op); + sched->add_barrier(); + + /* change the rank */ + newrank = rank / 2; + } + } + else /* rank >= 2*rem */ + newrank = rank - rem; + + if (newrank != -1) { + mask = 0x1; + while (mask < pof2) { + newdst = newrank ^ mask; + /* find real rank of dest */ + dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; + + /* Send the most current data, which is in recv_buf. Recv + * into tmp_buf */ + entry_factory::create(sched, tmp_buf, count, dtype, dst, comm); + /* sendrecv, no barrier here */ + entry_factory::create(sched, recv_buf, count, dtype, dst, comm); + sched->add_barrier(); + + /* tmp_buf contains data received in this step. + * recv_buf contains data accumulated so far */ + entry_factory::create( + sched, tmp_buf, count, recv_buf, nullptr, dtype, op); + sched->add_barrier(); + + mask <<= 1; + } + } + + /* In the non-power-of-two case, all odd-numbered + * processes of rank < 2*rem send the result to + * (rank-1), the ranks who didn't participate above. 
*/ + if (rank < 2 * rem) { + if (rank % 2) { /* odd */ + entry_factory::create(sched, recv_buf, count, dtype, rank - 1, comm); + } + else { /* even */ + entry_factory::create(sched, recv_buf, count, dtype, rank + 1, comm); + } + sched->add_barrier(); + } + + return status; +} + +static void ccl_allreduce_2d_add_allreduce_allgather(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t count, + const ccl_datatype& dtype, + ccl::reduction op, + ccl_comm* comm, + ccl_comm* first_dim_comm, + ccl_comm* second_dim_comm, + size_t chunk_idx, + size_t chunk_count) { + size_t dtype_size = dtype.size(); + size_t main_chunk_size = count / chunk_count; + size_t last_chunk_size = main_chunk_size + count % chunk_count; + size_t cnt = (chunk_idx == (chunk_count - 1)) ? last_chunk_size : main_chunk_size; + ccl_buffer rbuf = recv_buf + chunk_idx * main_chunk_size * dtype_size; + + size_t main_block_count = cnt / first_dim_comm->size(); + size_t last_block_count = main_block_count + cnt % first_dim_comm->size(); + size_t ar_count = (first_dim_comm->rank() == (first_dim_comm->size() - 1)) ? 
last_block_count + : main_block_count; + + if (ar_count) { + // TODO: add second level selection to distinguish high and low level algorithms + ccl_buffer ar_buf = rbuf + first_dim_comm->rank() * main_block_count * dtype_size; + ccl_coll_build_nreduce_allreduce( + sched, ar_buf, ar_buf, ar_count, dtype, op, second_dim_comm); + sched->add_barrier(); + } + + std::vector ag_recv_counts(first_dim_comm->size(), main_block_count); + ag_recv_counts[first_dim_comm->size() - 1] = last_block_count; + + // TODO: skip direct algo since it may be started + // with different order on different ranks + sched->hint_algo.allgatherv = ccl_coll_allgatherv_ring; + ccl_coll_build_allgatherv( + sched, rbuf, ar_count, rbuf, ag_recv_counts.data(), dtype, first_dim_comm); + sched->hint_algo.allgatherv = ccl_coll_allgatherv_undefined; +} + +static void ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t count, + const ccl_datatype& dtype, + ccl::reduction op, + ccl_comm* comm, + ccl_comm* first_dim_comm, + ccl_comm* second_dim_comm, + size_t chunk_idx, + size_t chunk_count) { + size_t dtype_size = dtype.size(); + size_t main_chunk_size = count / chunk_count; + size_t last_chunk_size = main_chunk_size + count % chunk_count; + size_t cnt = (chunk_idx == (chunk_count - 1)) ? 
last_chunk_size : main_chunk_size; + ccl_buffer sbuf = send_buf + chunk_idx * main_chunk_size * dtype_size; + ccl_buffer rbuf = recv_buf + chunk_idx * main_chunk_size * dtype_size; + + ccl_coll_build_reduce_scatter(sched, sbuf, rbuf, cnt, dtype, op, first_dim_comm, true); + sched->add_barrier(); + + if (chunk_idx == (chunk_count - 1) || (chunk_count == 1)) { + ccl_allreduce_2d_add_allreduce_allgather(sched, + send_buf, + recv_buf, + count, + dtype, + op, + comm, + first_dim_comm, + second_dim_comm, + chunk_idx, + chunk_count); + } + else { + entry_factory::create( + sched, + chunk_idx, + [send_buf, + recv_buf, + count, + &dtype, + op, + comm, + first_dim_comm, + second_dim_comm, + chunk_idx, + chunk_count](ccl_sched* s) { + ccl_allreduce_2d_add_allreduce_allgather(s, + send_buf, + recv_buf, + count, + dtype, + op, + comm, + first_dim_comm, + second_dim_comm, + chunk_idx, + chunk_count); + }, + "AR_AG"); + + entry_factory::create( + sched, + chunk_idx + 1, + [send_buf, + recv_buf, + count, + &dtype, + op, + comm, + first_dim_comm, + second_dim_comm, + chunk_idx, + chunk_count](ccl_sched* s) { + ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(s, + send_buf, + recv_buf, + count, + dtype, + op, + comm, + first_dim_comm, + second_dim_comm, + chunk_idx + 1, + chunk_count); + }, + "RS_AR_AG"); + } +} + +ccl::status ccl_coll_build_2d_allreduce(ccl_sched* sched, + ccl_buffer send_buf, + ccl_buffer recv_buf, + size_t count, + const ccl_datatype& dtype, + ccl::reduction op, + ccl_comm* comm) { + ccl::status status = ccl::status::success; + + size_t chunk_count = ccl::global_data::env().allreduce_2d_chunk_count; + + bool switch_dims = ccl::global_data::env().allreduce_2d_switch_dims; + ccl_comm* first_dim_comm = + (switch_dims) ? comm->get_r2r_comm().get() : comm->get_node_comm().get(); + ccl_comm* second_dim_comm = + (switch_dims) ? 
comm->get_node_comm().get() : comm->get_r2r_comm().get(); + + LOG_DEBUG("build 2d allreduce: chunk_count: ", + chunk_count, + ", switch_dims: ", + switch_dims, + ", comm: ", + comm->to_string(), + ", 1st dim comm: ", + first_dim_comm->to_string(), + ", 2nd dim comm: ", + second_dim_comm->to_string()); + + ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(sched, + send_buf, + recv_buf, + count, + dtype, + op, + comm, + first_dim_comm, + second_dim_comm, + 0 /* chunk_idx */, + chunk_count); + + return status; +} + #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) ccl::status ccl_coll_build_topo_allreduce(ccl_sched* sched, @@ -613,140 +788,169 @@ ccl::status ccl_coll_build_topo_allreduce(ccl_sched* sched, sched->try_enable_ze_single_list(); std::vector wait_events; - if (is_single_card && ccl::global_data::env().enable_ze_bidir_algo) { - LOG_DEBUG("topo/bidir: each rank uses ze_onesided_allreduce"); + size_t base_count = count; + size_t pair_comm_offset = 0; + size_t pair_comm_offset_bytes = 0; - size_t base_count = count / pair_comm->size(); - size_t offset = base_count * pair_comm->rank(); + if (ccl::global_data::env().enable_ze_bidir_algo) { + base_count = count / pair_comm->size(); + pair_comm_offset = base_count * pair_comm->rank(); + pair_comm_offset_bytes = pair_comm_offset * dtype.size(); if (pair_comm->rank() == pair_comm->size() - 1) base_count += count % pair_comm->size(); + } - LOG_DEBUG("rank: ", pair_comm->rank(), ", count: ", base_count, ", offset: ", offset); + else if (pair_comm->rank() != ccl::global_data::env().kernel_1s_lead) { + ccl::add_comm_barrier(sched, pair_comm, ipc_event_pool, ipc_event_count++); + CCL_THROW_IF_NOT(ipc_event_count <= max_ipc_event_count, + "unexpected ipc_event_count ", + ipc_event_count, + ", expected max ", + max_ipc_event_count); + return ccl::status::success; + } - entry_factory::create( - sched, send_buf, recv_buf, base_count, dtype, op, pair_comm, wait_events, offset); + size_t main_block_count = base_count 
/ even_comm_size; + size_t block_count = main_block_count; + if (even_comm->rank() == even_comm_size - 1) { + block_count += base_count % even_comm_size; + } + size_t even_comm_offset_bytes = main_block_count * even_comm->rank() * dtype.size(); + ccl_buffer pair_comm_send_buf = send_buf + pair_comm_offset_bytes; + ccl_buffer pair_comm_recv_buf = recv_buf + pair_comm_offset_bytes; + ccl_buffer even_comm_recv_buf = recv_buf + pair_comm_offset_bytes + even_comm_offset_bytes; + + LOG_DEBUG("rank: ", + pair_comm->rank(), + ", count: ", + base_count, + ", pair_comm_offset: ", + pair_comm_offset); + if (is_single_card) { + entry_factory::create(sched, + pair_comm_send_buf, + pair_comm_recv_buf, + base_count, + dtype, + op, + pair_comm, + wait_events, + pair_comm_offset); } - else if (pair_comm->rank() == ccl::global_data::env().kernel_1s_lead) { - if (is_single_card) { - LOG_DEBUG("topo/scale_up/intra: use ze_onesided_allreduce"); - auto entry = entry_factory::create( - sched, send_buf, recv_buf, count, dtype, op, pair_comm, wait_events); + else { + LOG_DEBUG("topo/scale_up/intra: use ze_onesided_reduce"); + auto entry = entry_factory::create(sched, + pair_comm_send_buf, + pair_comm_recv_buf, + base_count, + dtype, + op, + pair_comm->rank(), + pair_comm, + wait_events, + pair_comm_offset); + wait_events.push_back(entry->entry_event); + } + sched->add_barrier(); + + bool is_read_allgatherv = ccl::global_data::env().allgatherv_topo_read; + if (is_multi_card) { + ccl::add_comm_barrier(sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++); + // cannot use ze_a2a_allreduce_entry with allgatherv_read + // since comm_barrier is not available inside the entry + if (is_single_node && !is_read_allgatherv) { + LOG_DEBUG("topo/scale_up/intra: use ze_a2a_allreduce_entry"); + auto entry = entry_factory::create(sched, + pair_comm_recv_buf, + pair_comm_recv_buf, + base_count, + dtype, + op, + even_comm, + wait_events, + recv_buf_idx, + recv_buf_idx, + pair_comm_offset); 
wait_events.push_back(entry->entry_event); + ccl::add_comm_barrier(sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++); } else { - LOG_DEBUG("topo/scale_up/intra: use ze_onesided_reduce"); - auto entry = entry_factory::create(sched, - send_buf, - recv_buf, - count, - dtype, - op, - pair_comm->rank(), - pair_comm, - wait_events); + LOG_DEBUG("topo/scale_up/inter: use ze_a2a_reduce_scatter_entry"); + std::vector block_counts(even_comm->size(), main_block_count); + block_counts.back() += base_count % even_comm_size; + auto entry = entry_factory::create(sched, + pair_comm_recv_buf, + even_comm_recv_buf, + block_counts.data(), + dtype, + op, + even_comm, + wait_events, + recv_buf_idx, + pair_comm_offset); wait_events.push_back(entry->entry_event); + ccl::add_comm_barrier(sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++); } - sched->add_barrier(); + } - size_t main_block_count = count / even_comm_size; - size_t block_count = main_block_count; - if (even_comm->rank() == even_comm_size - 1) { - block_count += count % even_comm_size; + ccl_coll_entry_param coll_param{ .ctype = ccl_coll_allreduce, + .send_buf = even_comm_recv_buf, + .recv_buf = even_comm_recv_buf, + .count = block_count, + .dtype = dtype, + .reduction = op, + .comm = r2r_comm }; + + ccl::add_scaleout(sched, coll_param, is_single_node, wait_events); + + if (is_multi_card && (!is_single_node || is_read_allgatherv)) { + LOG_DEBUG("topo/scale_up/inter: use ze_a2a_allgatherv"); + // for multinode with allgatherv_read, use a comm_barrier to make sure all + // r2r scaleout within even_comm has finished so that remote reads are valid + if (!is_single_node && is_read_allgatherv) { + ccl::add_comm_barrier(sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++); } - if (is_multi_card) { - auto barrier_event = ccl::add_comm_barrier( - sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++); - wait_events.push_back(barrier_event); - - if (is_single_node) { - 
LOG_DEBUG("topo/scale_up/inter: use ze_a2a_allreduce"); - auto entry = entry_factory::create(sched, - recv_buf, - recv_buf, - count, - dtype, - op, - even_comm, - wait_events, - recv_buf_idx); - wait_events.push_back(entry->entry_event); - - auto barrier_event = ccl::add_comm_barrier( - sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++); - wait_events.push_back(barrier_event); - } - else { - size_t offset_bytes = main_block_count * even_comm->rank() * dtype.size(); - ccl_buffer partial_recv_buf = recv_buf + offset_bytes; - LOG_DEBUG("topo/scale_up/inter: use ze_a2a_reduce_scatter_entry"); - std::vector block_counts(even_comm->size(), main_block_count); - block_counts.back() = block_count; - auto entry = entry_factory::create(sched, - recv_buf, - partial_recv_buf, - block_counts.data(), - dtype, - op, - even_comm, - wait_events, - recv_buf_idx); - wait_events.push_back(entry->entry_event); - - auto barrier_event = ccl::add_comm_barrier( - sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++); - wait_events.push_back(barrier_event); - } - } - - size_t offset_bytes = main_block_count * even_comm->rank() * dtype.size(); - ccl_buffer partial_recv_buf = recv_buf + offset_bytes; - - ccl_coll_entry_param coll_param{ .ctype = ccl_coll_allreduce, - .send_buf = partial_recv_buf, - .recv_buf = partial_recv_buf, - .count = block_count, - .dtype = dtype, - .reduction = op, - .comm = r2r_comm }; - - ccl::add_scaleout(sched, coll_param, is_single_node, wait_events); - - if (is_multi_card && !is_single_node) { - LOG_DEBUG("topo/scale_up/inter: use ze_a2a_allgatherv"); - std::vector recv_counts(even_comm_size, main_block_count); - recv_counts.at(even_comm->rank()) = block_count; - auto entry = entry_factory::create(sched, - recv_buf, - block_count, - recv_buf, - recv_counts.data(), - dtype, - even_comm, - wait_events, - recv_buf_idx); - wait_events.push_back(entry->entry_event); - - auto barrier_event = ccl::add_comm_barrier( - sched, even_comm, 
wait_events, ipc_event_pool, ipc_event_count++); - wait_events.push_back(barrier_event); + std::vector recv_counts(even_comm_size, main_block_count); + recv_counts.back() += base_count % even_comm_size; + // allgatherv entry takes an array of recv bufs with one buffer for each rank + // rather than a single large recv buf that needs to be divided + std::vector recv_bufs; + for (int i = 0; i < even_comm_size; i++) { + recv_bufs.push_back(pair_comm_recv_buf + i * main_block_count * dtype.size()); } + auto entry = entry_factory::create(sched, + recv_bufs[even_comm->rank()], + block_count, + recv_bufs, + recv_counts, + dtype, + even_comm, + wait_events, + recv_buf_idx, + pair_comm_offset); + wait_events.push_back(entry->entry_event); + ccl::add_comm_barrier(sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++); + } - if (!is_single_card && (pair_comm->size() > 1)) { - LOG_DEBUG("topo/scale_up/intra: use ze_onesided_bcast"); - int peer_rank = (pair_comm->rank() + 1) % pair_comm->size(); - auto entry = entry_factory::create( - sched, - recv_buf, - ccl_buffer(), - count, - dtype, - copy_attr(peer_rank, recv_buf_idx, copy_direction::d2d, pair_comm), - wait_events); - wait_events.push_back(entry->entry_event); - sched->add_barrier(); - } + if (!is_single_card && pair_comm->size() > 1) { + LOG_DEBUG("topo/scale_up/intra: use ze_onesided_bcast"); + int peer_rank = (pair_comm->rank() + 1) % pair_comm->size(); + auto entry = entry_factory::create(sched, + recv_buf, + ccl_buffer(), + base_count, + dtype, + copy_attr(peer_rank, + recv_buf_idx, + copy_direction::t2t, + pair_comm, + pair_comm_offset, + pair_comm_offset), + wait_events); + wait_events.push_back(entry->entry_event); + sched->add_barrier(); } ccl::add_comm_barrier(sched, pair_comm, ipc_event_pool, ipc_event_count++); diff --git a/src/coll/algorithms/allreduce/allreduce_2d.cpp b/src/coll/algorithms/allreduce/allreduce_2d.cpp deleted file mode 100644 index 254a5cd0d..000000000 --- 
a/src/coll/algorithms/allreduce/allreduce_2d.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* - Copyright 2016-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#include "coll/algorithms/algorithms.hpp" -#include "coll/algorithms/allreduce/allreduce_2d.hpp" -#include "common/global/global.hpp" -#include "sched/entry/factory/entry_factory.hpp" - -ccl_allreduce_2d_builder::ccl_allreduce_2d_builder(size_t base_size, - bool switch_dims, - ccl_comm* comm) { - parent_comm = comm; - - int first_dim_color, second_dim_color; - - if (switch_dims) { - first_dim_color = comm->rank() / base_size; - second_dim_color = comm->rank() % base_size; - } - else { - first_dim_color = comm->rank() % base_size; - second_dim_color = comm->rank() / base_size; - } - - first_dim_comm = std::shared_ptr(comm->create_subcomm(first_dim_color)); - - second_dim_comm = std::shared_ptr(comm->create_subcomm(second_dim_color)); - - if (comm->rank() == 0) { - std::string first_dim_ranks, second_dim_ranks; - for (int idx = 0; idx < first_dim_comm->size(); idx++) { - first_dim_ranks += ((idx) ? " " : "") + std::to_string(idx); - } - for (int idx = 0; idx < second_dim_comm->size(); idx++) { - second_dim_ranks += ((idx) ? 
" " : "") + std::to_string(idx); - } - - std::stringstream ss; - ss << "{" - << "base: " << base_size << ", switch: " << switch_dims - << ", 1st dim: {size:" << first_dim_comm->size() << ", ranks:" << first_dim_ranks << "}" - << ", 2nd dim: {size:" << second_dim_comm->size() << ", ranks:" << second_dim_ranks - << "}" - << "}"; - LOG_DEBUG(ss.str()); - } -} - -ccl_allreduce_2d_builder::~ccl_allreduce_2d_builder() { - first_dim_comm.reset(); - second_dim_comm.reset(); -} - -static void ccl_allreduce_2d_add_allreduce_allgather(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& dtype, - ccl::reduction op, - ccl_comm* comm, - size_t chunk_idx, - size_t chunk_count) { - ccl_comm* first_dim_comm = comm->get_allreduce_2d_builder()->get_first_dim_comm(); - ccl_comm* second_dim_comm = comm->get_allreduce_2d_builder()->get_second_dim_comm(); - - size_t dtype_size = dtype.size(); - size_t main_chunk_size = count / chunk_count; - size_t last_chunk_size = main_chunk_size + count % chunk_count; - size_t cnt = (chunk_idx == (chunk_count - 1)) ? last_chunk_size : main_chunk_size; - ccl_buffer rbuf = recv_buf + chunk_idx * main_chunk_size * dtype_size; - - size_t main_block_count = cnt / first_dim_comm->size(); - size_t last_block_count = main_block_count + cnt % first_dim_comm->size(); - size_t ar_count = (first_dim_comm->rank() == (first_dim_comm->size() - 1)) ? 
last_block_count - : main_block_count; - - if (ar_count) { - /* TODO: add second level selection to distinguish high and low level algorithms */ - ccl_buffer ar_buf = rbuf + first_dim_comm->rank() * main_block_count * dtype_size; - ccl_coll_build_nreduce_allreduce( - sched, ar_buf, ar_buf, ar_count, dtype, op, second_dim_comm); - sched->add_barrier(); - } - - std::vector ag_recv_counts(first_dim_comm->size(), main_block_count); - ag_recv_counts[first_dim_comm->size() - 1] = last_block_count; - ccl_coll_build_allgatherv( - sched, rbuf, ar_count, rbuf, ag_recv_counts.data(), dtype, first_dim_comm); -} - -static void ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& dtype, - ccl::reduction op, - ccl_comm* comm, - size_t chunk_idx, - size_t chunk_count) { - ccl_comm* first_dim_comm = comm->get_allreduce_2d_builder()->get_first_dim_comm(); - - size_t dtype_size = dtype.size(); - size_t main_chunk_size = count / chunk_count; - size_t last_chunk_size = main_chunk_size + count % chunk_count; - size_t cnt = (chunk_idx == (chunk_count - 1)) ? 
last_chunk_size : main_chunk_size; - ccl_buffer sbuf = send_buf + chunk_idx * main_chunk_size * dtype_size; - ccl_buffer rbuf = recv_buf + chunk_idx * main_chunk_size * dtype_size; - - ccl_coll_build_reduce_scatter(sched, sbuf, rbuf, cnt, dtype, op, first_dim_comm, true); - sched->add_barrier(); - - if (chunk_idx == (chunk_count - 1) || (chunk_count == 1)) { - ccl_allreduce_2d_add_allreduce_allgather( - sched, send_buf, recv_buf, count, dtype, op, comm, chunk_idx, chunk_count); - } - else { - entry_factory::create( - sched, - chunk_idx, - [send_buf, recv_buf, count, &dtype, op, comm, chunk_idx, chunk_count](ccl_sched* s) { - ccl_allreduce_2d_add_allreduce_allgather( - s, send_buf, recv_buf, count, dtype, op, comm, chunk_idx, chunk_count); - }, - "AR_AG"); - - entry_factory::create( - sched, - chunk_idx + 1, - [send_buf, recv_buf, count, &dtype, op, comm, chunk_idx, chunk_count](ccl_sched* s) { - ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather( - s, send_buf, recv_buf, count, dtype, op, comm, chunk_idx + 1, chunk_count); - }, - "RS_AR_AG"); - } -} - -ccl::status ccl_allreduce_2d_builder::build(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& dtype, - ccl::reduction op) { - CCL_THROW_IF_NOT(sched && send_buf && recv_buf && count, - "incorrect values, sched ", - sched, - ", send ", - send_buf, - " recv ", - recv_buf); - - ccl::status status = ccl::status::success; - - size_t chunk_count = ccl::global_data::env().ar2d_chunk_count; - - if (chunk_count == 0) { - LOG_ERROR("unexpected chunk_count"); - chunk_count = 1; - } - - LOG_DEBUG("build 2d allreduce, chunk_count ", chunk_count); - - ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather( - sched, send_buf, recv_buf, count, dtype, op, parent_comm, 0 /* chunk_idx */, chunk_count); - - return status; -} diff --git a/src/coll/algorithms/allreduce/allreduce_2d.hpp b/src/coll/algorithms/allreduce/allreduce_2d.hpp deleted file mode 100644 index 
41de40d1f..000000000 --- a/src/coll/algorithms/allreduce/allreduce_2d.hpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - Copyright 2016-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#pragma once - -#include "common/utils/buffer.hpp" -#include "sched/sched.hpp" - -class comm; - -class ccl_allreduce_2d_builder { -public: - ccl_allreduce_2d_builder(size_t base_size, bool switch_dims, ccl_comm* comm); - ~ccl_allreduce_2d_builder(); - - ccl_allreduce_2d_builder(const ccl_allreduce_2d_builder&) = delete; - ccl_allreduce_2d_builder(ccl_allreduce_2d_builder&&) = delete; - - ccl_allreduce_2d_builder& operator=(const ccl_allreduce_2d_builder&) = delete; - ccl_allreduce_2d_builder& operator=(ccl_allreduce_2d_builder&&) = delete; - - ccl::status build(ccl_sched* sched, - ccl_buffer send_buf, - ccl_buffer recv_buf, - size_t count, - const ccl_datatype& dtype, - ccl::reduction op); - - ccl_comm* get_first_dim_comm() const { - return first_dim_comm.get(); - } - ccl_comm* get_second_dim_comm() const { - return second_dim_comm.get(); - } - -private: - ccl_comm* parent_comm{}; - std::shared_ptr first_dim_comm; - std::shared_ptr second_dim_comm; -}; diff --git a/src/coll/algorithms/allreduce/allreduce_rma.cpp b/src/coll/algorithms/allreduce/allreduce_rma.cpp index da418f555..f10a93cc6 100644 --- a/src/coll/algorithms/allreduce/allreduce_rma.cpp +++ b/src/coll/algorithms/allreduce/allreduce_rma.cpp @@ -210,9 +210,9 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched, 
sched->set_entry_exec_mode(ccl_sched_entry_exec_once); + send_entry* e{}; if (inplace) { - send_entry* e = - entry_factory::create(sched, + e = entry_factory::create(sched, ccl_buffer(&ar_handler->tmp_buf_mr, sizeof(atl_mr_t)), sizeof(atl_mr_t), ccl_datatype_int8, @@ -221,7 +221,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched, e->set_field_fn(rma_ring_allreduce_get_tmp_buf_mr, ar_handler); } else { - send_entry* e = entry_factory::create( + e = entry_factory::create( sched, ccl_buffer(&ar_handler->recv_buf_mr, sizeof(atl_mr_t)), sizeof(atl_mr_t), @@ -230,8 +230,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched, comm); e->set_field_fn(rma_ring_allreduce_get_recv_buf_mr, ar_handler); } - send_entry* e = - entry_factory::create(sched, + e = entry_factory::create(sched, ccl_buffer(&ar_handler->recv_buf_mr, sizeof(atl_mr_t)), sizeof(atl_mr_t), ccl_datatype_int8, @@ -269,7 +268,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched, comm); if (ar_handler->wait_dst) { - send_entry* e = entry_factory::create( + e = entry_factory::create( sched, ccl_buffer(ar_handler->dst_ready_flag_mr, sizeof(atl_mr_t)), sizeof(atl_mr_t), diff --git a/src/coll/algorithms/alltoallv.cpp b/src/coll/algorithms/alltoallv.cpp index 4289e9aa4..2bb4dbc80 100644 --- a/src/coll/algorithms/alltoallv.cpp +++ b/src/coll/algorithms/alltoallv.cpp @@ -271,6 +271,10 @@ ccl::status ccl_coll_build_scatter_alltoallv(ccl_sched* main_sched, if (src == comm_rank) continue; + if (recv_counts[src] == 0) { + continue; + } + size_t sched_idx = (comm_rank + src) % sched_count; ccl_buffer recv_buf; @@ -296,7 +300,9 @@ ccl::status ccl_coll_build_scatter_alltoallv(ccl_sched* main_sched, continue; size_t sched_idx = (comm_rank + dst) % sched_count; - + if (send_counts[dst] == 0) { + continue; + } entry_factory::make_chunked_send_entry(send_scheds, sched_idx, ccl_buffer(coll_param.get_send_buf_ptr(), @@ -312,13 +318,17 @@ ccl::status 
ccl_coll_build_scatter_alltoallv(ccl_sched* main_sched, if (!inplace) return ccl::status::success; - main_sched->sync_subscheds(); + if (main_sched) { + main_sched->sync_subscheds(); + } for (int idx = 0; idx < comm_size; idx++) { int src = (comm_rank + idx) % comm_size; if (src == comm_rank) continue; - + if (recv_counts[src] == 0) { + continue; + } size_t sched_idx = (comm_rank + src) % sched_count; entry_factory::create(scheds[sched_idx], @@ -340,9 +350,10 @@ ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, const ccl_coll_param& coll_param) { ccl_comm* comm = coll_param.comm; ccl_sched* sched = scheds.front(); - CCL_THROW_IF_NOT(scheds.size() == 1, "unexpected scheds size: ", scheds.size()); + const ccl_datatype& dtype = coll_param.dtype; const bool is_inplace = coll_param.is_inplace(); + const bool is_read = ccl::global_data::env().alltoallv_topo_read; int comm_rank = comm->rank(); int comm_size = comm->size(); @@ -391,11 +402,10 @@ ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, ccl_comm* pair_comm = comm->get_pair_comm().get(); ccl_comm* even_comm = comm->get_even_comm().get(); ccl_comm* node_comm = comm->get_node_comm().get(); + ccl_comm* r2r_comm = comm->get_r2r_comm().get(); - const int even_comm_size = even_comm->size(); - bool is_multi_card = (even_comm_size > 1); const ccl::topo_manager& topo_manager = comm->get_topo_manager(); - CCL_THROW_IF_NOT(topo_manager.is_single_card != is_multi_card); + bool is_single_node = topo_manager.is_single_node; // IPC exchange std::vector in_buffers; @@ -426,31 +436,44 @@ ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, std::vector wait_events; std::list parallel_copy_events; - auto copy_to_peers = [&](std::vector& bufs, std::vector& counts) { + auto copy_to_peers = [&](std::vector& bufs, + std::vector& counts, + ccl_comm* comm, + int start_buf_idx, + int offset) { auto card_count = even_comm->size(); auto tile_count = pair_comm->size(); for (int card_idx = 0; card_idx < 
card_count; card_idx++) { for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) { auto peer_rank = (card_idx * tile_count + tile_idx); - if (peer_rank == comm_rank) + if (peer_rank == comm->rank()) continue; copy_attr attr{}; attr.peer_rank = peer_rank; - attr.peer_buf_idx = recv_buf_idx_start + comm_rank; - attr.direction = copy_direction::d2d; + attr.peer_buf_idx = start_buf_idx + offset; attr.map_comm = comm; attr.hint_queue_index = parallel_copy_events.size(); - attr.is_peer_card_copy = true; - auto entry = entry_factory::create(sched, - bufs[peer_rank], - ccl_buffer(), - counts[peer_rank], - dtype, - attr, - wait_events); + attr.direction = copy_direction::c2c; + + if (!is_single_node) { + // in order to get the correct offset for peer rank for example(2) + peer_rank = peer_rank + (r2r_comm->rank() * node_comm->size()); + } + + auto src = bufs[peer_rank]; + auto dst = ccl_buffer(); + if (is_read) { + src = ccl_buffer(); + dst = bufs[peer_rank]; + LOG_DEBUG("ze_copy: read copy is enabled") + } + + auto entry = entry_factory::create( + sched, src, dst, counts[peer_rank], dtype, attr, wait_events); parallel_copy_events.push_back(entry->entry_event); } } + LOG_DEBUG("copy_to_peers phase done") }; auto add_sched_barrier_for_parallel_copies = [&]() { @@ -463,18 +486,14 @@ ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, auto copy_to_self = [&](ccl_buffer& send, ccl_buffer& recv, const size_t count) { copy_attr attr{}; attr.hint_queue_index = parallel_copy_events.size(); - attr.is_peer_card_copy = true; + attr.direction = copy_direction::c2c; auto entry = entry_factory::create( sched, send, recv, count, dtype, attr, wait_events); parallel_copy_events.push_back(entry->entry_event); + LOG_DEBUG("copy_to_self phase done") }; - auto barrier_comm = [&](ccl_comm* comm) { - auto barrier_event = ccl::add_comm_barrier(sched, comm, wait_events); - wait_events.push_back(barrier_event); - }; - - if (is_inplace) { + auto inplace_mode = [&](ccl_comm* comm, int 
offset) { for (int idx = 0; idx < comm_size; idx++) { CCL_THROW_IF_NOT(send_bufs[idx].get_ptr() == recv_bufs[idx].get_ptr(), "unexpected send_buf ptr for inplace case"); @@ -485,20 +504,114 @@ ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, copy_to_self(recv_bufs[idx], tmp_bufs[idx], send_counts[idx]); } add_sched_barrier_for_parallel_copies(); - barrier_comm(node_comm); + ccl::add_comm_barrier(sched, node_comm, wait_events); - // copy from my tmp buffer to peer recv - copy_to_peers(tmp_bufs, send_counts); - } - else { - // copy from own send to own recv - copy_to_self(send_bufs[comm->rank()], recv_bufs[comm->rank()], send_counts[comm->rank()]); // copy from peer rank send to peer rank recv - copy_to_peers(send_bufs, send_counts); + if (is_read) { + copy_to_peers(recv_bufs, recv_counts, comm, tmp_buf_idx_start, offset); + } + else { + copy_to_peers(tmp_bufs, send_counts, comm, recv_buf_idx_start, offset); + } + }; + + // the case, when it only goes on a single node, + // scale up is performing + if (is_single_node) { + LOG_DEBUG("topo/scale_up/intra: in single node case"); + if (is_inplace) { + LOG_DEBUG("inplace/topo/scale_up/intra: in single node case"); + // do scale up for inplace on a global comm + inplace_mode(comm, comm->rank()); + } + else { + LOG_DEBUG("outofplace/topo/scale_up/intra: in single node case"); + // do scale up for outofplace on a global comm + // copy from own send to own recv + copy_to_self( + send_bufs[comm->rank()], recv_bufs[comm->rank()], send_counts[comm->rank()]); + // copy from peer rank send to peer rank recv + if (is_read) { + copy_to_peers(recv_bufs, recv_counts, comm, send_buf_idx_start, comm->rank()); + } + else { + copy_to_peers(send_bufs, send_counts, comm, recv_buf_idx_start, comm->rank()); + } + } } + // the last part of scale out case is intra scale up + // it runs when it is multi + if (!is_single_node) { + // Here is example of the whole flow: + // input data: scale-out:(1) scale-up:(2) result: + // 0r 0 1 
2 3 |x x 8 12 |0 4 x x |0 4 8 12 + // 1r 4 5 6 7 |x x 9 13 |1 5 x x |1 5 9 13 + // 2r 8 9 10 11 |2 6 x x |x x 10 14 |2 6 10 14 + // 3r 12 13 14 15 |3 7 x x |x x 11 15 |3 7 11 15 + + // set it by default to use + // scatter algo for scale out phase + auto ze_multi_workers_saved = ccl::global_data::env().ze_multi_workers; + ccl::global_data::env().ze_multi_workers = 1; + + auto tmp_send_counts = send_counts; + auto tmp_recv_counts = recv_counts; + + // skip the parts for scale up phase via nullifying + // those send/recv_counts which are placed on one node + // checking if ranks from global comm is a local one + auto rank_info = topo_manager.get_filtered_rank_info_vec(topo_manager.get_host_idx()); + for (int rank_idx = 0; rank_idx < comm_size; rank_idx++) { + for (auto& local_info : rank_info) { + if (rank_idx == local_info.rank) { + tmp_send_counts[rank_idx] = 0; + tmp_recv_counts[rank_idx] = 0; + } + } + } + + // preparation for host alltoall coll + ccl_coll_entry_param host_coll_param{ .ctype = ccl_coll_alltoallv, + .send_bufs = send_bufs, + .recv_bufs = recv_bufs, + .send_counts = tmp_send_counts.data(), + .recv_counts = tmp_recv_counts.data(), + .dtype = dtype, + .comm = comm }; + if (is_inplace) { + host_coll_param.recv_bufs = send_bufs; + } + host_coll_param.hint_algo.alltoallv = ccl_coll_alltoallv_direct; + + // do alltoall on the host (scale out) using global comm + ccl::add_scaleout(sched, host_coll_param, is_single_node, wait_events); + // returned back saved value + ccl::global_data::env().ze_multi_workers = ze_multi_workers_saved; + + LOG_DEBUG("topo/scale_up/intra: in scale out case"); + if (is_inplace) { + LOG_DEBUG("inplace/topo/scale_up/intra: in multi node case"); + // do scale up for inplace on a node comm + inplace_mode(node_comm, comm_rank); + } + else { + LOG_DEBUG("outofplace/topo/scale_up/intra: in multi node case"); + // do scale up for outofplace on a node comm + // copy from own send to own recv + copy_to_self( + send_bufs[comm->rank()], 
recv_bufs[comm->rank()], send_counts[comm->rank()]); + // copy from peer rank send to peer rank recv + if (is_read) { + copy_to_peers(recv_bufs, recv_counts, node_comm, send_buf_idx_start, comm_rank); + } + else { + copy_to_peers(send_bufs, send_counts, node_comm, recv_buf_idx_start, comm_rank); + } + } + } add_sched_barrier_for_parallel_copies(); - barrier_comm(node_comm); + ccl::add_comm_barrier(sched, node_comm, wait_events); return ccl::status::success; } diff --git a/src/coll/algorithms/reduce.cpp b/src/coll/algorithms/reduce.cpp index 8d206bafc..fd08cc572 100644 --- a/src/coll/algorithms/reduce.cpp +++ b/src/coll/algorithms/reduce.cpp @@ -460,7 +460,7 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, ccl::reduction op, int root, ccl_comm* comm) { - LOG_DEBUG("build gpu reduce"); + LOG_DEBUG("build gpu topo reduce"); ccl_comm* pair_comm = comm->get_pair_comm().get(); ccl_comm* even_comm = comm->get_even_comm().get(); @@ -494,11 +494,21 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, tmp_buf_idx = in_buffers.size(); in_buffers.push_back({ tmp_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }); } + size_t ipc_event_count{}; + size_t max_ipc_event_count{ 5 }; + ze_event_pool_handle_t ipc_event_pool{}; + if (ccl::global_data::env().enable_ze_barrier) { + ipc_event_pool = sched->get_memory().ipc_event_pool_manager.create(max_ipc_event_count); + in_buffers.push_back({ static_cast(ipc_event_pool), ccl::ze::ipc_mem_type::pool }); + } - ccl::add_handle_exchange(sched, node_comm, in_buffers); + ccl::add_handle_exchange( + sched, node_comm, in_buffers, ccl_comm::invalid_rank, ipc_event_pool, ipc_event_count++); CCL_THROW_IF_NOT(comm_size % 2 == 0, "unexpected comm_size ", comm_size); CCL_THROW_IF_NOT(node_comm_size % 2 == 0, "unexpected node_comm_size ", node_comm_size); + CCL_THROW_IF_NOT( + count >= size_t(comm_size), "unexpected count:", count, " < comm_size:", comm_size); if (is_single_card) { LOG_DEBUG("topo/scale_up/intra: use 
ze_onesided_reduce"); @@ -509,7 +519,124 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, ccl::add_comm_barrier(sched, pair_comm); } + else if (ccl::global_data::env().enable_ze_bidir_algo) { + size_t base_count = count; + size_t pair_comm_offset = 0; + size_t pair_comm_offset_bytes = 0; + + base_count = count / pair_comm->size(); + pair_comm_offset = base_count * pair_comm->rank(); + pair_comm_offset_bytes = pair_comm_offset * dtype.size(); + + if (pair_comm->rank() == pair_comm->size() - 1) + base_count += count % pair_comm->size(); + + size_t main_block_count = base_count / even_comm_size; + size_t block_count = main_block_count; + if (even_comm->rank() == even_comm_size - 1) { + block_count += base_count % even_comm_size; + } + size_t even_comm_offset_bytes = main_block_count * even_comm->rank() * dtype.size(); + ccl_buffer pair_comm_send_buf = send_buf + pair_comm_offset_bytes; + ccl_buffer pair_comm_recv_buf = tmp_buf + pair_comm_offset_bytes; + ccl_buffer even_comm_recv_buf = tmp_buf + pair_comm_offset_bytes + even_comm_offset_bytes; + + LOG_DEBUG("rank: ", + pair_comm->rank(), + ", count: ", + count, + ", base_count: ", + base_count, + ", block_count: ", + block_count, + ", pair_comm_offset: ", + pair_comm_offset, + ", pair_comm_offset_bytes: ", + pair_comm_offset_bytes); + + std::vector wait_events; + + LOG_DEBUG("topo/scale_up/intra: use ze_onesided_reduce"); + auto entry = entry_factory::create(sched, + pair_comm_send_buf, + pair_comm_recv_buf, + base_count, + dtype, + op, + pair_comm->rank(), + pair_comm, + wait_events, + pair_comm_offset); + wait_events.push_back(entry->entry_event); + ccl::add_comm_barrier(sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++); + + LOG_DEBUG("topo/scale_up/inter: use ze_a2a_reduce_scatter_entry"); + std::vector block_counts(even_comm->size(), main_block_count); + block_counts.back() = block_count; + auto entry_reduce_scatter = + entry_factory::create(sched, + pair_comm_recv_buf, + 
even_comm_recv_buf, + block_counts.data(), + dtype, + op, + even_comm, + wait_events, + tmp_buf_idx, + pair_comm_offset); + wait_events.push_back(entry_reduce_scatter->entry_event); + ccl::add_comm_barrier(sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++); + + CCL_THROW_IF_NOT(comm->size() % node_comm_size == 0); + int root_node_idx = root / node_comm_size; + size_t offset = (pair_comm_offset_bytes + even_comm_offset_bytes) / dtype.size(); + + if (!is_single_node) { + ccl_coll_entry_param coll_param{ .ctype = ccl_coll_reduce, + .send_buf = even_comm_recv_buf, + .recv_buf = even_comm_recv_buf, + .count = block_count, + .dtype = dtype, + .reduction = op, + .root = root_node_idx, + .comm = r2r_comm }; + + ccl::add_scaleout(sched, + coll_param, + is_single_node, + wait_events, + copy_attr(copy_direction::h2d, 0, offset), + r2r_comm, + use_tmp_buf ? tmp_buf : recv_buf, + root_node_idx); + ccl::add_comm_barrier(sched, node_comm, wait_events, ipc_event_pool, ipc_event_count++); + } + + if (root_node_idx == r2r_comm->rank()) { + LOG_DEBUG("topo/scale_up/intra: use ze_gatherv"); + int root_card_idx = root / pair_comm->size(); + copy_direction cd = + even_comm->rank() != root_card_idx ? copy_direction::d2d : copy_direction::t2t; + auto entry_copy = entry_factory::create( + sched, + even_comm_recv_buf, + comm->rank() == root ? 
recv_buf : ccl_buffer(), + block_count, + dtype, + copy_attr(root, recv_buf_idx, cd, comm, 0, offset), + wait_events); + wait_events.push_back(entry_copy->entry_event); + ccl::add_comm_barrier(sched, node_comm, wait_events, ipc_event_pool, ipc_event_count++); + } + + CCL_THROW_IF_NOT(ipc_event_count <= max_ipc_event_count, + "unexpected ipc_event_count ", + ipc_event_count, + ", expected max ", + max_ipc_event_count); + } else { + LOG_DEBUG("topo reduce original"); if (pair_comm->rank() == ccl::global_data::env().kernel_1s_lead) { LOG_DEBUG("topo/scale_up/intra: use ze_onesided_reduce"); entry_factory::create( diff --git a/src/coll/coll.cpp b/src/coll/coll.cpp index 5be94783a..f8f762c9a 100644 --- a/src/coll/coll.cpp +++ b/src/coll/coll.cpp @@ -51,7 +51,6 @@ #include "coll/algorithms/algorithms.hpp" #include "coll/algorithms/algorithm_utils.hpp" -#include "coll/algorithms/allreduce/allreduce_2d.hpp" #include "coll/selection/selection.hpp" #include "exec/exec.hpp" #include "fusion/fusion.hpp" @@ -68,6 +67,12 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr& #ifdef CCL_ENABLE_SYCL if (ccl::global_data::env().enable_op_sync) attr.synchronous = 1; + + if (param.stream && ccl::global_data::env().enable_external_queue) { + LOG_DEBUG("use external queue in CCL for compute kernel."); + // Todo: need to submit kernel before this API return. Now, just use wait execution as WA. + attr.synchronous = 1; + } #endif // CCL_ENABLE_SYCL LOG_DEBUG("\n{\n", @@ -137,8 +142,8 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr& /* 6. 
regular schedule execution */ ccl_request* request = sched->start(data.executor.get()); if (sched->coll_attr.synchronous) { + request->synchronous = true; ccl_wait_impl(data.executor.get(), request); - request = nullptr; } #ifdef CCL_ENABLE_ITT @@ -228,7 +233,8 @@ ccl::status ccl_coll_build_allreduce(ccl_sched* sched, size_t count, const ccl_datatype& dtype, ccl::reduction reduction, - ccl_comm* comm) { + ccl_comm* comm, + bool is_scaleout) { CCL_ASSERT(sched != nullptr && comm != nullptr); ccl::status status = ccl::status::success; @@ -243,6 +249,7 @@ ccl::status ccl_coll_build_allreduce(ccl_sched* sched, param.is_sycl_buf = sched->coll_attr.is_sycl_buf; #endif // CCL_ENABLE_SYCL param.hint_algo = sched->hint_algo; + param.is_scaleout = is_scaleout; auto algo = ccl::global_data::get().algorithm_selector->get(param); @@ -283,8 +290,8 @@ ccl::status ccl_coll_build_allreduce(ccl_sched* sched, sched, send_buf, recv_buf, count, dtype, reduction, comm)); break; case ccl_coll_allreduce_2d: - CCL_CALL(comm->get_allreduce_2d_builder()->build( - sched, send_buf, recv_buf, count, dtype, reduction)); + CCL_CALL(ccl_coll_build_2d_allreduce( + sched, send_buf, recv_buf, count, dtype, reduction, comm)); break; #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) case ccl_coll_allreduce_topo: diff --git a/src/coll/coll.hpp b/src/coll/coll.hpp index 180d5f06a..dfc01a11f 100644 --- a/src/coll/coll.hpp +++ b/src/coll/coll.hpp @@ -37,13 +37,15 @@ ccl::status ccl_coll_build_allgatherv(ccl_sched* sched, const ccl_datatype& dtype, ccl_comm* comm); +// TODO: pack this arguments in ccl_coll_build parameters structure ccl::status ccl_coll_build_allreduce(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, size_t count, const ccl_datatype& dtype, ccl::reduction reduction, - ccl_comm* comm); + ccl_comm* comm, + bool is_scaleout); ccl::status ccl_coll_build_alltoall(ccl_sched* sched, ccl_buffer send_buf, diff --git a/src/coll/coll_check.cpp b/src/coll/coll_check.cpp index 
d5fba1e9f..0bfb002e1 100644 --- a/src/coll/coll_check.cpp +++ b/src/coll/coll_check.cpp @@ -20,9 +20,10 @@ #include "coll/coll_check.hpp" #include "common/env/env.hpp" #include "common/global/global.hpp" -#include "common/utils/sycl_utils.hpp" #ifdef CCL_ENABLE_SYCL +#include "common/utils/sycl_utils.hpp" + void ccl_check_usm_pointers(const ccl_coll_param& param) { auto bufs = param.get_all_non_zero_bufs(); if (bufs.empty()) { diff --git a/src/coll/coll_param.cpp b/src/coll/coll_param.cpp index e4f91f75b..11f1faa11 100644 --- a/src/coll/coll_param.cpp +++ b/src/coll/coll_param.cpp @@ -17,7 +17,10 @@ #include "coll/coll_param.hpp" #include "common/global/global.hpp" + +#ifdef CCL_ENABLE_SYCL #include "common/utils/sycl_utils.hpp" +#endif // CCL_ENABLE_SYCL #define COPY_COMMON_OP_ATTRS(from, to) \ to->priority = from.get(); \ @@ -95,6 +98,7 @@ ccl_coll_param::ccl_coll_param() { recv_counts.reserve(1); stream = nullptr; comm = nullptr; + is_scaleout = false; } void ccl_coll_param::copy(const ccl_coll_param& other) { ctype = other.ctype; @@ -109,6 +113,7 @@ void ccl_coll_param::copy(const ccl_coll_param& other) { root = other.root; comm = other.comm; stream = other.stream; + is_scaleout = other.is_scaleout; copy_deps(other.deps); validate(); } diff --git a/src/coll/coll_param.hpp b/src/coll/coll_param.hpp index 3d61d28d0..32d9aa108 100644 --- a/src/coll/coll_param.hpp +++ b/src/coll/coll_param.hpp @@ -27,7 +27,7 @@ class ccl_comm; #include template -using ccl_sycl_typed_buffer_t = cl::sycl::buffer; +using ccl_sycl_typed_buffer_t = sycl::buffer; /* ordering should be aligned with ccl::datatype */ using ccl_sycl_buffer_one_dim_types = std::tuple, @@ -105,6 +105,7 @@ struct ccl_coll_param { ccl_stream* stream; ccl_comm* comm; std::vector deps; + bool is_scaleout; ccl_coll_param(); ccl_coll_param(const ccl_coll_param& other); diff --git a/src/coll/coll_util.cpp b/src/coll/coll_util.cpp index f8ebea2c9..c591065d7 100644 --- a/src/coll/coll_util.cpp +++ 
b/src/coll/coll_util.cpp @@ -41,6 +41,7 @@ void add_coll_entry(ccl_sched* sched, const ccl_coll_entry_param& param) { selector_param.is_sycl_buf = sched->coll_attr.is_sycl_buf; #endif // CCL_ENABLE_SYCL selector_param.hint_algo = param.hint_algo; + selector_param.is_scaleout = param.is_scaleout; if (ccl_is_device_side_algo(selector_param)) { sched->strict_order = true; @@ -99,11 +100,11 @@ void add_comm_barrier(ccl_sched* sched, sched->add_barrier(); } -ze_event_handle_t add_comm_barrier(ccl_sched* sched, - ccl_comm* comm, - const std::vector& wait_events, - ze_event_pool_handle_t ipc_pool, - size_t ipc_event_idx) { +void add_comm_barrier(ccl_sched* sched, + ccl_comm* comm, + std::vector& wait_events, + ze_event_pool_handle_t ipc_pool, + size_t ipc_event_idx) { sched->add_barrier(); auto signal_event = sched->get_memory().event_manager->create(); if (sched->use_single_list) { @@ -116,7 +117,7 @@ ze_event_handle_t add_comm_barrier(ccl_sched* sched, add_signal_event(sched, signal_event); } sched->add_barrier(); - return signal_event; + wait_events.push_back(signal_event); } void add_handle_exchange(ccl_sched* sched, @@ -173,8 +174,36 @@ void add_coll(ccl_sched* sched, param.stream); break; } + case ccl_coll_alltoallv: { + coll_param = ccl_coll_param::create_alltoallv_param(param.send_buf.get_src(), + param.send_counts, + param.recv_buf.get_src(), + param.recv_counts, + param.dtype.idx(), + attr, + param.comm, + param.stream); + + break; + } + case ccl_coll_allgatherv: { + coll_param = ccl_coll_param::create_allgatherv_param(param.send_buf.get_src(), + param.send_count, + param.recv_buf.get_src(), + param.recv_counts, + param.dtype.idx(), + attr, + param.comm, + param.stream); + break; + } default: CCL_THROW("unexpected coll type", ccl_coll_type_to_str(param.ctype)); } + LOG_DEBUG("scaleout/multi_workers: created params for: ", + ccl_coll_type_to_str(param.ctype), + " coll"); + // pass the scale-out selection param through factory + coll_param.is_scaleout = 
param.is_scaleout; ccl_sched_create_param sched_param(sched->sched_id, coll_param); entry_factory::create(sched, 0, sched_param, "SCALEOUT"); } @@ -190,69 +219,139 @@ void add_coll(ccl_sched* sched, } void add_scaleout(ccl_sched* sched, - const ccl_coll_entry_param& coll_param, + const ccl_coll_entry_param& in_coll_param, const bool is_single_node, std::vector& wait_events, const copy_attr& h2d_copy_attr, ccl_comm* global_comm, ccl_buffer global_recv_buf, int global_root) { - ccl_coll_entry_param local_coll_param(coll_param); + ccl_coll_entry_param coll_param(in_coll_param); - bool multi_node = (!is_single_node && local_coll_param.count); + bool multi_node = (!is_single_node && (coll_param.count || coll_param.recv_counts)); bool enable_hmem = (ccl::global_data::env().use_hmem && atl_base_comm::attr.out.enable_hmem); bool do_h2d_copy = - (local_coll_param.ctype == ccl_coll_allreduce && multi_node && !enable_hmem) || - (local_coll_param.ctype == ccl_coll_reduce && - local_coll_param.comm->rank() == local_coll_param.root); + ((coll_param.ctype == ccl_coll_allreduce || coll_param.ctype == ccl_coll_alltoallv || + coll_param.ctype == ccl_coll_alltoall || coll_param.ctype == ccl_coll_allgatherv) && + multi_node && !enable_hmem) || + (coll_param.ctype == ccl_coll_reduce && coll_param.comm->rank() == coll_param.root); + + auto copy_entry = + [&](ccl_buffer src, ccl_buffer dst, const size_t count, const copy_attr& copy_attr) { + LOG_DEBUG("topo/scale_out/intra: use ze_copy_entry"); + auto entry = entry_factory::create( + sched, src, dst, count, coll_param.dtype, copy_attr, wait_events); + wait_events.push_back(entry->entry_event); + }; + + auto copy_entry_with_offset = [&](std::vector bufs, + ccl_buffer buf, + const size_t* counts, + const copy_attr& copy_attr) { + size_t offset = 0; + // number of not skipped s/r_counts, helps calculate the offset + for (int idx = 0; idx < coll_param.comm->size(); idx++) { + if (counts[idx] == 0) { + continue; + } + + ccl_buffer src = 
bufs[idx]; + ccl_buffer dst = buf + offset; + if (copy_attr.direction == copy_direction::h2d) { + src = buf + offset; + dst = bufs[idx]; + } + copy_entry(src, dst, counts[idx], copy_attr); + offset += counts[idx] * coll_param.dtype.size(); + } + LOG_DEBUG("copy_entry_with_offset done"); + }; if (multi_node) { if (!enable_hmem) { - LOG_DEBUG("topo/scale_out: use host_", ccl_coll_type_to_str(local_coll_param.ctype)); - ccl::alloc_param alloc_param(local_coll_param.count * local_coll_param.dtype.size(), - ccl::buffer_type::regular, - ccl::buffer_place::host); - local_coll_param.send_buf = sched->alloc_buffer(alloc_param); - local_coll_param.recv_buf = local_coll_param.send_buf; - - auto entry = entry_factory::create(sched, - coll_param.send_buf, - local_coll_param.send_buf, - local_coll_param.count, - local_coll_param.dtype, - copy_attr(copy_direction::d2h), - wait_events); - wait_events.push_back(entry->entry_event); + LOG_DEBUG("topo/scale_out: use host_", ccl_coll_type_to_str(coll_param.ctype)); + + size_t host_buf_size = 0; + if (coll_param.ctype == ccl_coll_alltoallv || coll_param.ctype == ccl_coll_alltoall || + coll_param.ctype == ccl_coll_allgatherv) { + // assume sum of send_counts and recv_counts are equal for alltoallv + host_buf_size = std::accumulate(coll_param.recv_counts, + coll_param.recv_counts + coll_param.comm->size(), + 0) * + coll_param.dtype.size(); + LOG_DEBUG("alltoall(v) scale_out host buf size: ", host_buf_size); + } + else { + host_buf_size = coll_param.count * coll_param.dtype.size(); + } + + CCL_THROW_IF_NOT(host_buf_size != invalid_host_buf_size, + "unexpected the size of buffer in scaleout phase"); + ccl::alloc_param alloc_param( + host_buf_size, ccl::buffer_type::regular, ccl::buffer_place::host); + coll_param.send_buf = sched->alloc_buffer(alloc_param); + coll_param.recv_buf = coll_param.send_buf; + + if (coll_param.ctype == ccl_coll_alltoallv || coll_param.ctype == ccl_coll_alltoall) { + copy_entry_with_offset(in_coll_param.send_bufs, 
+ coll_param.send_buf, + coll_param.send_counts, + copy_attr(copy_direction::d2h)); + } + else if (coll_param.ctype == ccl_coll_allgatherv) { + size_t offset = std::accumulate(coll_param.recv_counts, + coll_param.recv_counts + coll_param.comm->rank(), + 0) * + coll_param.dtype.size(); + copy_entry(in_coll_param.send_buf, + coll_param.send_buf + offset, + coll_param.send_count, + copy_attr(copy_direction::d2h)); + } + else { + copy_entry(in_coll_param.send_buf, + coll_param.send_buf, + coll_param.count, + copy_attr(copy_direction::d2h)); + } sched->add_barrier(); + + LOG_DEBUG("topo/scale_out: ze_copy_entry of D2H for ", + ccl_coll_type_to_str(coll_param.ctype), + " done"); } + // pass the scale-out selection param directly + coll_param.is_scaleout = true; // do inplace collective - ccl::add_coll(sched, local_coll_param, wait_events); + ccl::add_coll(sched, coll_param, wait_events); } if (!do_h2d_copy) return; - ccl_buffer src_copy_buf = local_coll_param.recv_buf; - ccl_buffer dst_copy_buf = coll_param.recv_buf; + ccl_buffer src_copy_buf = coll_param.recv_buf; + ccl_buffer dst_copy_buf = in_coll_param.recv_buf; - if (coll_param.ctype == ccl_coll_reduce) { + if (in_coll_param.ctype == ccl_coll_reduce) { if (!multi_node) - src_copy_buf = coll_param.recv_buf; + src_copy_buf = in_coll_param.recv_buf; dst_copy_buf = (global_comm->rank() == global_root) ? 
global_recv_buf : ccl_buffer(); } - LOG_DEBUG("topo/scale_up/intra: use ze_copy_entry"); - auto entry = entry_factory::create(sched, - src_copy_buf, - dst_copy_buf, - local_coll_param.count, - local_coll_param.dtype, - h2d_copy_attr, - wait_events); - wait_events.push_back(entry->entry_event); + if (coll_param.ctype == ccl_coll_alltoallv || coll_param.ctype == ccl_coll_alltoall || + coll_param.ctype == ccl_coll_allgatherv) { + copy_entry_with_offset( + in_coll_param.recv_bufs, coll_param.recv_buf, coll_param.recv_counts, h2d_copy_attr); + } + else { + copy_entry(src_copy_buf, dst_copy_buf, coll_param.count, h2d_copy_attr); + } sched->add_barrier(); -} + LOG_DEBUG("topo/scale_out: ze_copy_entry of H2D for ", + ccl_coll_type_to_str(coll_param.ctype), + " done"); +} #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL } // namespace ccl diff --git a/src/coll/coll_util.hpp b/src/coll/coll_util.hpp index 8693466b7..5e2729018 100644 --- a/src/coll/coll_util.hpp +++ b/src/coll/coll_util.hpp @@ -27,6 +27,7 @@ namespace ccl { void add_coll_entry(ccl_sched* sched, const ccl_coll_entry_param& param); #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) +static constexpr int invalid_host_buf_size = 0; void add_wait_events(ccl_sched* sched, const std::vector& wait_events); void add_signal_event(ccl_sched* sched, ze_event_handle_t signal_event); @@ -37,11 +38,11 @@ void add_comm_barrier(ccl_sched* sched, ze_event_pool_handle_t ipc_pool = {}, size_t ipc_event_idx = 0); -ze_event_handle_t add_comm_barrier(ccl_sched* sched, - ccl_comm* comm, - const std::vector& wait_events, - ze_event_pool_handle_t ipc_pool = {}, - size_t ipc_event_idx = 0); +void add_comm_barrier(ccl_sched* sched, + ccl_comm* comm, + std::vector& wait_events, + ze_event_pool_handle_t ipc_pool = {}, + size_t ipc_event_idx = 0); void add_handle_exchange(ccl_sched* sched, ccl_comm* comm, @@ -55,14 +56,13 @@ void add_coll(ccl_sched* sched, std::vector& wait_events); void add_scaleout(ccl_sched* sched, - const 
ccl_coll_entry_param& coll_param, + const ccl_coll_entry_param& in_coll_param, const bool is_single_node, std::vector& wait_events, const copy_attr& h2d_copy_attr = copy_attr(copy_direction::h2d), ccl_comm* global_comm = nullptr, ccl_buffer global_recv = {}, int global_root = 0); - #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL } // namespace ccl diff --git a/src/coll/selection/selection.cpp b/src/coll/selection/selection.cpp index 41be1e628..3c1bdd89a 100644 --- a/src/coll/selection/selection.cpp +++ b/src/coll/selection/selection.cpp @@ -18,7 +18,6 @@ #include "common/global/global.hpp" #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) -#include #include "common/utils/sycl_utils.hpp" #include "sched/entry/ze/ze_primitives.hpp" #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE @@ -96,6 +95,13 @@ bool ccl_is_direct_algo(const ccl_selector_param& param) { namespace checkers { +bool is_unknown_device_family(const ccl_selector_param& param) { + if (param.stream) { + return param.stream->get_device_family() == ccl::device_family::unknown; + } + return false; +} + bool is_family1_card(const ccl_selector_param& param) { if (param.stream) { return param.stream->get_device_family() == ccl::device_family::family1; @@ -141,7 +147,7 @@ bool is_gpu_stream(const ccl_selector_param& param) { } bool is_single_node(const ccl_selector_param& param) { - size_t local_proc_count = ccl::global_data::get().executor->get_local_proc_count(); + size_t local_proc_count = ccl::global_data::get().get_local_proc_count(); return static_cast(param.comm->size()) <= local_proc_count; } @@ -229,7 +235,11 @@ bool ccl_is_device_side_algo(const ccl_selector_param& param) { } bool ccl_can_use_topo_algo(const ccl_selector_param& param) { - RETURN_FALSE_IF(!ccl::global_data::env().enable_topo_algo, "topo algo is explicitly disabled"); +#ifdef CCL_ENABLE_SYCL + RETURN_FALSE_IF(!param.comm->get_env()->get_enable_topo_algo(), "topo algo is disabled"); +#else // CCL_ENABLE_SYCL + return false; +#endif // 
CCL_ENABLE_SYCL auto supported_colls = { ccl_coll_allgatherv, ccl_coll_allreduce, ccl_coll_alltoall, ccl_coll_alltoallv, ccl_coll_bcast, ccl_coll_reduce, @@ -237,17 +247,7 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) { RETURN_FALSE_IF(!checkers::is_coll_supported(supported_colls, param.ctype), "coll is not supported"); - // Fallback if implicit scaling is enabled - char* implicit_scaling_env = getenv("EnableImplicitScaling"); - int enable_implicit_scaling; - - if (implicit_scaling_env) { - enable_implicit_scaling = atoi(implicit_scaling_env); - LOG_DEBUG("Implicit scaling not null value = ", enable_implicit_scaling); - RETURN_FALSE_IF(enable_implicit_scaling != 0, "Implicit scaling is not supported"); - } - - size_t local_proc_count = ccl::global_data::get().executor->get_local_proc_count(); + size_t local_proc_count = ccl::global_data::get().get_local_proc_count(); int comm_size = param.comm->size(); LOG_DEBUG("coll ", @@ -278,16 +278,6 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) { RETURN_FALSE_IF(!param.comm->get_even_comm().get(), "sub-communicators are not available"); #ifdef CCL_ENABLE_SYCL - RETURN_FALSE_IF(ccl::global_data::env().enable_ze_bidir_algo && - (checkers::is_family1_card(param) || !checkers::is_single_card(param)) && - param.ctype == ccl_coll_allreduce, - "bidir for allreduce is not supported for family1 card or for multi-card"); - - RETURN_FALSE_IF( - (!ccl::global_data::env().enable_ze_bidir_algo || checkers::is_family1_card(param)) && - (param.ctype == ccl_coll_alltoall || param.ctype == ccl_coll_alltoallv), - "alltoall(v) is supported with bidir only or not on family1 card"); - RETURN_FALSE_IF(!param.comm->get_topo_manager().has_p2p_access(), "no p2p access between devices"); @@ -297,6 +287,15 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) { RETURN_FALSE_IF(!param.comm->get_topo_manager().has_same_domains(), "processes are not properly distributed among domains"); + if 
(!ccl::global_data::env().ze_disable_oversubscription_check) { + RETURN_FALSE_IF(param.comm->get_topo_manager().has_oversubscription(), + "oversubscription case: one rank per device is only supported"); + } + + RETURN_FALSE_IF(!ccl::global_data::env().enable_ze_bidir_algo && + (param.ctype == ccl_coll_alltoall || param.ctype == ccl_coll_alltoallv), + "alltoall(v) is supported with bidir only"); + if (!ccl::global_data::env().disable_ze_port_check) { RETURN_FALSE_IF( !checkers::is_single_card(param) && param.comm->get_topo_manager().has_failed_ports(), @@ -305,10 +304,26 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) { if (!ccl::global_data::env().disable_ze_family_check) { RETURN_FALSE_IF(checkers::is_family1_card(param) && !checkers::is_single_card(param), - "family1 multi-card for ", + "multi-card ", ccl_coll_type_to_str(param.ctype), - " is not supported"); - } + " is not supported for family1"); + + RETURN_FALSE_IF( + checkers::is_family1_card(param) && ccl::global_data::env().enable_ze_bidir_algo, + "bidir ", + ccl_coll_type_to_str(param.ctype), + " is not supported for family1"); + } + + RETURN_FALSE_IF(checkers::is_unknown_device_family(param), + "topo algo is not supported for unknown device family"); +#ifndef CCL_BF16_GPU_TRUNCATE + RETURN_FALSE_IF(checkers::is_unknown_device_family(param) && + (param.dtype.idx() == ccl::datatype::bfloat16) && + (param.ctype == ccl_coll_allreduce || param.ctype == ccl_coll_reduce || + param.ctype == ccl_coll_reduce_scatter), + "bfloat16 reduction is not supported for unknown device family"); +#endif // !CCL_BF16_GPU_TRUNCATE #endif // CCL_ENABLE_SYCL RETURN_FALSE_IF((((param.ctype == ccl_coll_allreduce) || (param.ctype == ccl_coll_bcast) || @@ -317,9 +332,7 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) { "unsupported comm size for ", ccl_coll_type_to_str(param.ctype)); - RETURN_FALSE_IF((param.ctype == ccl_coll_allgatherv || param.ctype == ccl_coll_alltoall || - param.ctype == 
ccl_coll_alltoallv || param.ctype == ccl_coll_bcast || - param.ctype == ccl_coll_reduce_scatter) && + RETURN_FALSE_IF((param.ctype == ccl_coll_bcast || param.ctype == ccl_coll_reduce_scatter) && !checkers::is_single_node(param), "multi-node for ", ccl_coll_type_to_str(param.ctype), @@ -338,56 +351,42 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) { (local_proc_count % 2 != 0), "odd proc count per node is not supported"); + RETURN_FALSE_IF((param.ctype == ccl_coll_reduce || param.ctype == ccl_coll_allreduce) && + (param.count < size_t(param.comm->size())), + "reduce with count < comm_size not supported"); + return true; } bool ccl_can_use_datatype(ccl_coll_algo algo, const ccl_selector_param& param) { - // regular datatype, don't need to check for an additional support - if (param.dtype.idx() != ccl::datatype::bfloat16 && - param.dtype.idx() != ccl::datatype::float16) { + if (param.dtype.idx() != ccl::datatype::float16) { return true; } bool can_use = true; + // algorithms running on device side support fp16 + // so we don't need to require their support on the host bool device_side_algo = ccl_is_device_side_algo(algo, param); + if (device_side_algo) { + return true; + } - // algorithms running on device side support fp16 and bf16 both - // so we don't need to require their support on the host - if (!device_side_algo) { - if (param.dtype == ccl::datatype::bfloat16) { - bool bf16_hw_support = - ccl::global_data::env().bf16_impl_type != ccl_bf16_no_hardware_support; - bool bf16_compiler_support = - ccl::global_data::env().bf16_impl_type != ccl_bf16_no_compiler_support; - - can_use = bf16_compiler_support && bf16_hw_support; - - if (!can_use) { - LOG_DEBUG("BF16 datatype is requested for ", - ccl_coll_type_to_str(param.ctype), - " running on CPU but not fully supported: hw: ", - bf16_hw_support, - " compiler: ", - bf16_compiler_support); - } - } - else if (param.dtype == ccl::datatype::float16) { - bool fp16_hw_support = - 
ccl::global_data::env().fp16_impl_type != ccl_fp16_no_hardware_support; - bool fp16_compiler_support = - ccl::global_data::env().fp16_impl_type != ccl_fp16_no_compiler_support; - - can_use = fp16_hw_support && fp16_compiler_support; - - if (!can_use) { - LOG_DEBUG("FP16 datatype is requested for ", - ccl_coll_type_to_str(param.ctype), - " running on CPU but not fully supported: hw: ", - fp16_hw_support, - " compiler: ", - fp16_compiler_support); - } + if (param.dtype == ccl::datatype::float16) { + bool fp16_hw_support = + ccl::global_data::env().fp16_impl_type != ccl_fp16_no_hardware_support; + bool fp16_compiler_support = + ccl::global_data::env().fp16_impl_type != ccl_fp16_no_compiler_support; + + can_use = fp16_hw_support && fp16_compiler_support; + + if (!can_use) { + LOG_DEBUG("FP16 datatype is requested for ", + ccl_coll_type_to_str(param.ctype), + " running on CPU but not fully supported: hw: ", + fp16_hw_support, + " compiler: ", + fp16_compiler_support); } } diff --git a/src/coll/selection/selector.hpp b/src/coll/selection/selector.hpp index fbc97d903..9d2dcdb1b 100644 --- a/src/coll/selection/selector.hpp +++ b/src/coll/selection/selector.hpp @@ -52,6 +52,8 @@ struct ccl_selector_param { #endif // CCL_ENABLE_SYCL ccl_coll_algo hint_algo = {}; + + bool is_scaleout = false; }; template @@ -69,6 +71,7 @@ using ccl_selection_table_iter_t = typename ccl_selection_table_t main_table{}; \ ccl_selection_table_t fallback_table{}; \ + ccl_selection_table_t scaleout_table{}; \ ccl_algorithm_selector_base(){}; \ void init(); \ void print() const; \ diff --git a/src/coll/selection/selector_allgatherv.cpp b/src/coll/selection/selector_allgatherv.cpp index f78277a00..6d8a1c720 100644 --- a/src/coll/selection/selector_allgatherv.cpp +++ b/src/coll/selection/selector_allgatherv.cpp @@ -47,6 +47,14 @@ ccl_algorithm_selector::ccl_algorithm_selector() { #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, 
ccl_coll_allgatherv_flat); + + // scale-out table by default duplicates the main table + // TODO: fill the table with algorithms which are suitable for better scale-out performance. + // Explanation: when implementing it was a simple scenario that does not contradict with the selection logic. + // If there are no environment variables provided, scale-out path will go through the scaleout_table like it is a main_table + // and use fallback path if nothing is suitable. Correct default behavior of each algorithm's scale-out path is another task with discussion + // and performance measurements. + scaleout_table = main_table; } template <> @@ -82,4 +90,5 @@ CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_allgatherv_algo, 0); count /= param.comm->size(); count; - })); + }), + ccl::global_data::env().allgatherv_scaleout_algo_raw); diff --git a/src/coll/selection/selector_allreduce.cpp b/src/coll/selection/selector_allreduce.cpp index b98e9ffb1..521c37da8 100644 --- a/src/coll/selection/selector_allreduce.cpp +++ b/src/coll/selection/selector_allreduce.cpp @@ -59,6 +59,14 @@ ccl_algorithm_selector::ccl_algorithm_selector() { insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_allreduce_ring); insert(fallback_table, 0, CCL_ALLREDUCE_SHORT_MSG_SIZE, ccl_coll_allreduce_recursive_doubling); #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + + // scale-out table by default duplicates the main table + // TODO: fill the table with algorithms which are suitable for better scale-out performance. + // Explanation: when implementing it was a simple scenario that does not contradict with the selection logic. + // If there are no environment variables provided, scale-out path will go through the scaleout_table like it is a main_table + // and use fallback path if nothing is suitable. Correct default behavior of each algorithm's scale-out path is another task with discussion + // and performance measurements. 
+ scaleout_table = main_table; } template <> @@ -79,9 +87,6 @@ bool ccl_algorithm_selector_helper::can_use( can_use = false; else if (algo == ccl_coll_allreduce_nreduce && !(param.count / param.comm->size())) can_use = false; - else if (algo == ccl_coll_allreduce_2d && - (ccl::global_data::env().atl_transport == ccl_atl_mpi)) - can_use = false; else if (algo == ccl_coll_allreduce_direct && (ccl::global_data::env().atl_transport == ccl_atl_ofi)) can_use = false; @@ -94,4 +99,5 @@ CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_allreduce_algo, ccl_coll_allreduce, ccl::global_data::env().allreduce_algo_raw, - param.count); + param.count, + ccl::global_data::env().allreduce_scaleout_algo_raw); diff --git a/src/coll/selection/selector_alltoall.cpp b/src/coll/selection/selector_alltoall.cpp index d68df3325..8a48051e7 100644 --- a/src/coll/selection/selector_alltoall.cpp +++ b/src/coll/selection/selector_alltoall.cpp @@ -36,6 +36,14 @@ ccl_algorithm_selector::ccl_algorithm_selector() { } #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_alltoall_scatter); + + // scale-out table by default duplicates the main table + // TODO: fill the table with algorithms which are suitable for better scale-out performance. + // Explanation: when implementing it was a simple scenario that does not contradict with the selection logic. + // If there are no environment variables provided, scale-out path will go through the scaleout_table like it is a main_table + // and use fallback path if nothing is suitable. Correct default behavior of each algorithm's scale-out path is another task with discussion + // and performance measurements. 
+ scaleout_table = main_table; } template <> @@ -63,4 +71,5 @@ bool ccl_algorithm_selector_helper::can_use( CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_alltoall_algo, ccl_coll_alltoall, ccl::global_data::env().alltoall_algo_raw, - param.count); + param.count, + ccl::global_data::env().alltoall_scaleout_algo_raw); diff --git a/src/coll/selection/selector_alltoallv.cpp b/src/coll/selection/selector_alltoallv.cpp index dd76f5bef..b95973993 100644 --- a/src/coll/selection/selector_alltoallv.cpp +++ b/src/coll/selection/selector_alltoallv.cpp @@ -36,6 +36,14 @@ ccl_algorithm_selector::ccl_algorithm_selector() { } #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_alltoallv_scatter); + + // scale-out table by default duplicates the main table + // TODO: fill the table with algorithms which are suitable for better scale-out performance. + // Explanation: when implementing it was a simple scenario that does not contradict with the selection logic. + // If there are no environment variables provided, scale-out path will go through the scaleout_table like it is a main_table + // and use fallback path if nothing is suitable. Correct default behavior of each algorithm's scale-out path is another task with discussion + // and performance measurements. 
+ scaleout_table = main_table; } template <> @@ -63,4 +71,5 @@ bool ccl_algorithm_selector_helper::can_use( CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_alltoallv_algo, ccl_coll_alltoallv, ccl::global_data::env().alltoallv_algo_raw, - 0); + 0, + ccl::global_data::env().alltoallv_scaleout_algo_raw); diff --git a/src/coll/selection/selector_barrier.cpp b/src/coll/selection/selector_barrier.cpp index f18a12db1..510a71620 100644 --- a/src/coll/selection/selector_barrier.cpp +++ b/src/coll/selection/selector_barrier.cpp @@ -29,6 +29,14 @@ ccl_algorithm_selector::ccl_algorithm_selector() { insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_barrier_direct); insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_barrier_ring); + + // scale-out table by default duplicates the main table + // TODO: fill the table with algorithms which are suitable for better scale-out performance. + // Explanation: when implementing it was a simple scenario that does not contradict with the selection logic. + // If there are no environment variables provided, scale-out path will go through the scaleout_table like it is a main_table + // and use fallback path if nothing is suitable. Correct default behavior of each algorithm's scale-out path is another task with discussion + // and performance measurements. 
+ scaleout_table = main_table; } template <> @@ -47,4 +55,5 @@ bool ccl_algorithm_selector_helper::can_use( CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_barrier_algo, ccl_coll_barrier, ccl::global_data::env().barrier_algo_raw, - 0); + 0, + ccl::global_data::env().barrier_scaleout_algo_raw); diff --git a/src/coll/selection/selector_bcast.cpp b/src/coll/selection/selector_bcast.cpp index c928adc66..4602ae582 100644 --- a/src/coll/selection/selector_bcast.cpp +++ b/src/coll/selection/selector_bcast.cpp @@ -41,6 +41,14 @@ ccl_algorithm_selector::ccl_algorithm_selector() { #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_bcast_naive); + + // scale-out table by default duplicates the main table + // TODO: fill the table with algorithms which are suitable for better scale-out performance. + // Explanation: when implementing it was a simple scenario that does not contradict with the selection logic. + // If there are no environment variables provided, scale-out path will go through the scaleout_table like it is a main_table + // and use fallback path if nothing is suitable. Correct default behavior of each algorithm's scale-out path is another task with discussion + // and performance measurements. 
+ scaleout_table = main_table; } template <> @@ -72,4 +80,5 @@ bool ccl_algorithm_selector_helper::can_use( CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_bcast_algo, ccl_coll_bcast, ccl::global_data::env().bcast_algo_raw, - param.count); + param.count, + ccl::global_data::env().bcast_scaleout_algo_raw); diff --git a/src/coll/selection/selector_helper.hpp b/src/coll/selection/selector_helper.hpp index 0bf9846e4..0a9daf743 100644 --- a/src/coll/selection/selector_helper.hpp +++ b/src/coll/selection/selector_helper.hpp @@ -26,7 +26,8 @@ struct ccl_algorithm_selector_helper { static bool can_use(algo_group_type algo, const ccl_selector_param& param, const ccl_selection_table_t& table); - static const std::string& get_str_to_parse(); + static const std::string& get_main_str_to_parse(); + static const std::string& get_scaleout_str_to_parse(); static ccl_coll_type get_coll_id(); static size_t get_count(const ccl_selector_param& param); static algo_group_type algo_from_str(const std::string& str); @@ -40,10 +41,16 @@ const std::string& ccl_coll_algorithm_to_str(algo_group_type algo) { return ccl_algorithm_selector_helper::algo_to_str(algo); } -#define CCL_SELECTION_DEFINE_HELPER_METHODS(algo_group_type, coll_id, env_str, count_expr) \ +#define CCL_SELECTION_DEFINE_HELPER_METHODS( \ + algo_group_type, coll_id, main_env_str, count_expr, scaleout_env_str) \ template <> \ - const std::string& ccl_algorithm_selector_helper::get_str_to_parse() { \ - return env_str; \ + const std::string& ccl_algorithm_selector_helper::get_main_str_to_parse() { \ + return main_env_str; \ + } \ + template <> \ + const std::string& \ + ccl_algorithm_selector_helper::get_scaleout_str_to_parse() { \ + return scaleout_env_str; \ } \ template <> \ ccl_coll_type ccl_algorithm_selector_helper::get_coll_id() { \ diff --git a/src/coll/selection/selector_impl.hpp b/src/coll/selection/selector_impl.hpp index 6952d4ac4..6f6364c45 100644 --- a/src/coll/selection/selector_impl.hpp +++ 
b/src/coll/selection/selector_impl.hpp @@ -52,14 +52,9 @@ void ccl_selection_unpack_elem(size_t& size, } template -void ccl_algorithm_selector_base::init() { - const std::string& str_to_parse = - ccl_algorithm_selector_helper::get_str_to_parse(); - - size_t elem_size; - algo_group_type elem_algo; - ccl_selection_border_type elem_border; - +void fill_table_from_str(ccl_algorithm_selector_base* selector, + const std::string& str_to_parse, + ccl_selection_table_t& table) { std::string block; std::string algo_name_str; std::string size_str; @@ -98,8 +93,8 @@ void ccl_algorithm_selector_base::init() { if (algo_name_str.length() == block.length()) { /* set the single algorithm for the whole range */ - main_table.clear(); - insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, algo); + table.clear(); + selector->insert(table, 0, CCL_SELECTION_MAX_COLL_SIZE, algo); } else { try { @@ -145,13 +140,29 @@ void ccl_algorithm_selector_base::init() { right_size, ")"); - insert(main_table, left_size, right_size, algo); + selector->insert(table, left_size, right_size, algo); } block_stream.clear(); } +} + +template +void ccl_algorithm_selector_base::init() { + const std::string& main_str_to_parse = + ccl_algorithm_selector_helper::get_main_str_to_parse(); + const std::string& scaleout_str_to_parse = + ccl_algorithm_selector_helper::get_scaleout_str_to_parse(); + + size_t elem_size; + algo_group_type elem_algo; + ccl_selection_border_type elem_border; + + fill_table_from_str(this, main_str_to_parse, main_table); + fill_table_from_str(this, scaleout_str_to_parse, scaleout_table); - auto tables_to_check = - std::vector*>{ &main_table, &fallback_table }; + auto tables_to_check = std::vector*>{ + &main_table, &fallback_table, &scaleout_table + }; for (const auto& table : tables_to_check) { CCL_THROW_IF_NOT(table->size() >= 2, "selection table should have at least 2 entries"); @@ -198,15 +209,17 @@ void ccl_algorithm_selector_base::print() const { ccl_selection_border_type elem_border; 
std::stringstream str; - auto tables_to_print = - std::vector*>{ &main_table, &fallback_table }; + auto tables_to_print = std::vector*>{ + &main_table, &fallback_table, &scaleout_table + }; str << std::endl << ccl_coll_type_to_str(ccl_algorithm_selector_helper::get_coll_id()) << " selection" << std::endl; for (const auto& table : tables_to_print) { - const std::string& table_name = (table == &main_table) ? "main table" : "fallback table"; + std::string table_name = (table == &main_table) ? "main table" : "fallback table"; + table_name = (table == &scaleout_table) ? "scaleout table" : table_name; str << " " << table_name << std::endl; @@ -273,6 +286,25 @@ algo_group_type ccl_algorithm_selector_base::get( } size_t size = count * param.dtype.size(); + + // Firstly check the scale-out case + if (param.is_scaleout) { + auto lower_bound = scaleout_table.lower_bound(size); + ccl_selection_unpack_elem(elem_size, elem_algo, elem_border, lower_bound, scaleout_table); + + if (lower_bound != scaleout_table.end() && + ccl_algorithm_selector_helper::can_use( + elem_algo, param, scaleout_table)) { + LOG_DEBUG("selected scale-out algo: coll ", + ccl_coll_type_to_str(param.ctype), + ", count ", + count, + ", algo ", + ccl_coll_algorithm_to_str(elem_algo)); + return elem_algo; + } + } + auto lower_bound = main_table.lower_bound(size); ccl_selection_unpack_elem(elem_size, elem_algo, elem_border, lower_bound, main_table); @@ -446,7 +478,7 @@ void ccl_algorithm_selector_base::insert( return; /* merge adjacent ranges for the same algorithm */ - for (auto iter = table.begin(); iter != table.end();) { + for (iter = table.begin(); iter != table.end();) { ccl_selection_unpack_elem(elem_size, elem_algo, elem_border, iter, table); if (elem_border == ccl_selection_border_right || elem_border == ccl_selection_border_both) { diff --git a/src/coll/selection/selector_reduce.cpp b/src/coll/selection/selector_reduce.cpp index bfe38870d..44f47997c 100644 --- a/src/coll/selection/selector_reduce.cpp 
+++ b/src/coll/selection/selector_reduce.cpp @@ -40,6 +40,14 @@ ccl_algorithm_selector::ccl_algorithm_selector() { #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_tree); + + // scale-out table by default duplicates the main table + // TODO: fill the table with algorithms which are suitable for better scale-out performance. + // Explanation: when implementing it was a simple scenario that does not contradict with the selection logic. + // If there are no environment variables provided, scale-out path will go through the scaleout_table like it is a main_table + // and use fallback path if nothing is suitable. Correct default behavior of each algorithm's scale-out path is another task with discussion + // and performance measurements. + scaleout_table = main_table; } template <> @@ -67,4 +75,5 @@ bool ccl_algorithm_selector_helper::can_use( CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_reduce_algo, ccl_coll_reduce, ccl::global_data::env().reduce_algo_raw, - param.count); + param.count, + ccl::global_data::env().reduce_scaleout_algo_raw); diff --git a/src/coll/selection/selector_reduce_scatter.cpp b/src/coll/selection/selector_reduce_scatter.cpp index db0793b3c..a908ec4bd 100644 --- a/src/coll/selection/selector_reduce_scatter.cpp +++ b/src/coll/selection/selector_reduce_scatter.cpp @@ -32,6 +32,14 @@ ccl_algorithm_selector::ccl_algorithm_selector() { insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_scatter_direct); insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_scatter_ring); + + // scale-out table by default duplicates the main table + // TODO: fill the table with algorithms which are suitable for better scale-out performance. + // Explanation: when implementing it was a simple scenario that does not contradict with the selection logic. 
+ // If there are no environemnt variable provided, scale-out path will go through the scaleout_table like it is a main_table + // and use fallback path if nothing is suitable. Correct default behavior of each algorithm`s scale-out path is another task with discussion + // and performance measurements. + scaleout_table = main_table; } template <> @@ -54,4 +62,5 @@ bool ccl_algorithm_selector_helper::can_use( CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_reduce_scatter_algo, ccl_coll_reduce_scatter, ccl::global_data::env().reduce_scatter_algo_raw, - param.count); + param.count, + ccl::global_data::env().reduce_scatter_scaleout_algo_raw); diff --git a/src/comm/comm.cpp b/src/comm/comm.cpp index 96e0176bd..0d33712a4 100644 --- a/src/comm/comm.cpp +++ b/src/comm/comm.cpp @@ -33,6 +33,56 @@ #include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h" #include "kvs_impl.hpp" +#ifdef CCL_ENABLE_SYCL +#include "common/utils/sycl_utils.hpp" +#endif // CCL_ENABLE_SYCL + +// ccl_comm_env + +ccl_comm_env::ccl_comm_env(std::shared_ptr device) : device(device) { +#ifdef CCL_ENABLE_SYCL + enable_topo_algo = ccl::global_data::env().enable_topo_algo; + ze_copy_engine = ccl::global_data::env().ze_copy_engine; + ze_h2d_copy_engine = ccl::global_data::env().ze_h2d_copy_engine; + + if (device && + (device.get()->get_native().get_backend() == ccl::utils::get_level_zero_backend())) { + auto ze_device = + sycl::get_native(device.get()->get_native()); + CCL_THROW_IF_NOT(ze_device, "null ze device"); + + if ((ccl::ze::get_device_family(ze_device) == ccl::device_family::unknown) || + (ccl::ze::get_device_family(ze_device) == ccl::device_family::family1)) { + ze_copy_engine = ccl::ze::copy_engine_mode::none; + ze_h2d_copy_engine = ccl::ze::h2d_copy_engine_mode::none; + } + } + else { + enable_topo_algo = 0; + ze_copy_engine = ccl::ze::copy_engine_mode::none; + ze_h2d_copy_engine = ccl::ze::h2d_copy_engine_mode::none; + } +#endif // CCL_ENABLE_SYCL +} + +std::string 
ccl_comm_env::to_string() const { + std::stringstream ss; + ss << "{"; + +#ifdef CCL_ENABLE_SYCL + if (device) { + ss << " enable_topo_algo: " << enable_topo_algo; + ss << ", ze_copy_engine: " << ccl::ze::copy_engine_names[ze_copy_engine]; + ss << ", ze_h2d_copy_engine: " << ccl::ze::h2d_copy_engine_names[ze_h2d_copy_engine]; + ss << " "; + } +#endif // CCL_ENABLE_SYCL + + ss << "}"; + + return ss.str(); +} + // ccl_internal_comm ccl_internal_comm::ccl_internal_comm(int comm_id, @@ -56,6 +106,7 @@ void ccl_internal_comm::reset(int rank, int size) { } // ccl_comm + void ccl_comm::init(int comm_id, std::shared_ptr atl_comm, bool share_resources, @@ -89,6 +140,12 @@ void ccl_comm::init(int comm_id, else { local2global_map = atl_comm->get_rank2rank_map(); } + + env = std::make_shared(device_ptr); + + if (comm_rank == 0) { + LOG_DEBUG(to_string_ext()); + } } ccl_comm::ccl_comm(int comm_id, @@ -186,17 +243,7 @@ void ccl_comm::allocate_resources() { if (ccl::global_data::env().enable_unordered_coll) { comm_impl->unordered_coll_manager.reset(new ccl_unordered_coll_manager(*this)); } - - auto& env_object = ccl::global_data::env(); - - comm_impl->allreduce_2d_builder.reset(new ccl_allreduce_2d_builder( - (env_object.allreduce_2d_base_size != CCL_ENV_SIZET_NOT_SPECIFIED) - ? env_object.allreduce_2d_base_size - : ccl::global_data::get().executor->get_local_proc_count(), - env_object.allreduce_2d_switch_dims, - this)); - - env_object.print(rank()); + ccl::global_data::env().print(rank()); } ccl::comm_interface_ptr ccl_comm::split(const ccl::comm_split_attr& attr) { @@ -224,6 +271,7 @@ std::string ccl_comm::to_string_ext() const { ss << " node_comm: " << (node_comm ? node_comm->to_string() : "{}") << "\n"; ss << " even_comm: " << (even_comm ? even_comm->to_string() : "{}") << "\n"; ss << " pair_comm: " << (pair_comm ? pair_comm->to_string() : "{}") << "\n"; + ss << " env: " << (env ? 
env->to_string() : "{}") << "\n"; ss << "}"; return ss.str(); diff --git a/src/comm/comm.hpp b/src/comm/comm.hpp index c68f2ee1f..77b6fad46 100644 --- a/src/comm/comm.hpp +++ b/src/comm/comm.hpp @@ -19,7 +19,6 @@ #include #include "atl/atl_base_comm.hpp" -#include "coll/algorithms/allreduce/allreduce_2d.hpp" #include "comm/comm_interface.hpp" #include "comm/atl_tag.hpp" #include "common/log/log.hpp" @@ -38,6 +37,9 @@ #include "oneapi/ccl/coll_attr_ids.hpp" #include "oneapi/ccl/coll_attr_ids_traits.hpp" #include "oneapi/ccl/coll_attr.hpp" +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +#include "sched/entry/ze/ze_primitives.hpp" +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE #include "types_generator_defines.hpp" #include "topology/topo_manager.hpp" #include "unordered_coll/unordered_coll.hpp" @@ -63,6 +65,39 @@ class kvs_interface; } } // namespace ccl +// comm-specific environment +// based on global environment +// and adjusted according to comm parameters +class ccl_comm_env { +public: + ccl_comm_env(std::shared_ptr device); + ccl_comm_env(const ccl_comm_env& other) = delete; + ccl_comm_env& operator=(const ccl_comm_env& other) = delete; + ~ccl_comm_env() = default; + + std::string to_string() const; + +#ifdef CCL_ENABLE_SYCL + int get_enable_topo_algo() const { + return enable_topo_algo; + } + + ccl::ze::copy_engine_mode get_ze_copy_engine() const { + return ze_copy_engine; + } +#endif // CCL_ENABLE_SYCL + +private: + std::shared_ptr device; + +#ifdef CCL_ENABLE_SYCL + int enable_topo_algo; + ccl::ze::copy_engine_mode ze_copy_engine; + ccl::ze::h2d_copy_engine_mode ze_h2d_copy_engine; + +#endif // CCL_ENABLE_SYCL +}; + // the main purpose of internal comm is to hold // shareable parts of ccl_comm which don't need to // be copied/reset on ccl_comm's copy @@ -94,7 +129,6 @@ class alignas(CACHELINE_SIZE) ccl_internal_comm { std::shared_ptr atl_comm; std::unique_ptr unordered_coll_manager; - std::unique_ptr allreduce_2d_builder; private: int m_rank; @@ 
-229,12 +263,13 @@ class alignas(CACHELINE_SIZE) ccl_comm : public ccl::comm_interface { } } + std::shared_ptr get_env() const { + return env; + } + std::unique_ptr& get_unordered_coll_manager() const { return comm_impl->unordered_coll_manager; } - std::unique_ptr& get_allreduce_2d_builder() const { - return comm_impl->allreduce_2d_builder; - } int rank() const override { return comm_rank; @@ -300,6 +335,8 @@ class alignas(CACHELINE_SIZE) ccl_comm : public ccl::comm_interface { ccl_rank2rank_map local2global_map{}; ccl::topo_manager topo_manager; + std::shared_ptr env; + ccl_sched_id_t next_sched_id_internal; ccl_sched_id_t next_sched_id_external; diff --git a/src/comm/comm_interface.hpp b/src/comm/comm_interface.hpp index fe12b7b5d..8d74679b6 100644 --- a/src/comm/comm_interface.hpp +++ b/src/comm/comm_interface.hpp @@ -65,18 +65,18 @@ class reduce_scatter_attr; COMM_INTERFACE_COLL_##TYPE(double); #define SYCL_COMM_INTERFACE_COLL_METHODS(TYPE) \ - COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer); \ - /*COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer);*/ \ - COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer); \ - /*COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer);*/ + COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer); \ + 
COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer); \ + /*COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer);*/ \ + COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer); \ + /*COMM_INTERFACE_COLL_CLASS_##TYPE(sycl::buffer);*/ #define COMM_INTERFACE_COLL_INSTANTIATION(COMM) \ COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, int8_t); \ @@ -93,14 +93,14 @@ class reduce_scatter_attr; COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, ccl::float16); #define SYCL_COMM_INTERFACE_COLL_INSTANTIATION(COMM) \ - COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer); \ - /*COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer);*/ \ - COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer); \ - COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer); \ - /*COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer);*/ + COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, sycl::buffer); \ + /*COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, sycl::buffer);*/ \ + COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, sycl::buffer); \ + COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, sycl::buffer); \ + /*COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, sycl::buffer);*/ namespace ccl { struct comm_interface : public comm_selector { diff --git a/src/common/api_wrapper/api_wrapper.cpp b/src/common/api_wrapper/api_wrapper.cpp new file mode 100644 index 000000000..29d26ae15 --- /dev/null +++ b/src/common/api_wrapper/api_wrapper.cpp @@ -0,0 +1,106 @@ +/* + Copyright 2016-2020 Intel 
Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "common/api_wrapper/api_wrapper.hpp" +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +#include "common/api_wrapper/ze_api_wrapper.hpp" +#endif //CCL_ENABLE_SYCL && CCL_ENABLE_ZE +#if defined(CCL_ENABLE_MPI) +#include "common/api_wrapper/mpi_api_wrapper.hpp" +#endif //CCL_ENABLE_MPI +#include "common/api_wrapper/ofi_api_wrapper.hpp" + +#include + +namespace ccl { + +void api_wrappers_init() { + bool ofi_inited = true, mpi_inited = true; + if (!(ofi_inited = ofi_api_init())) { + LOG_INFO("could not initialize OFI api"); + } +#if defined(CCL_ENABLE_MPI) + if (!(mpi_inited = mpi_api_init())) { + LOG_INFO("could not initialize MPI api"); + } +#endif //CCL_ENABLE_MPI + CCL_THROW_IF_NOT(ofi_inited || mpi_inited, "could not initialize any transport library"); + if (!ofi_inited && (ccl::global_data::env().atl_transport == ccl_atl_ofi)) { + ccl::global_data::env().atl_transport = ccl_atl_mpi; + LOG_WARN("OFI transport was not initialized, fallback to MPI transport"); + } + + if (!mpi_inited && (ccl::global_data::env().atl_transport == ccl_atl_mpi)) { + ccl::global_data::env().atl_transport = ccl_atl_ofi; + LOG_WARN("MPI transport was not initialized, fallback to OFI transport"); + } + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + if (ccl::global_data::env().backend == backend_mode::native && + ccl::global_data::env().ze_enable) { + LOG_INFO("initializing level-zero api"); + if (ze_api_init()) { + 
try { + ccl::global_data::get().ze_data.reset(new ze::global_data_desc); + } + catch (const ccl::exception& e) { + LOG_INFO("could not initialize level-zero: ", e.what()); + } + catch (...) { + LOG_INFO("could not initialize level-zero: unknown error"); + } + } + } + else { + LOG_INFO("could not initialize level-zero api"); + } +#endif //CCL_ENABLE_SYCL && CCL_ENABLE_ZE +} + +void api_wrappers_fini() { + ofi_api_fini(); +#if defined(CCL_ENABLE_MPI) + mpi_api_fini(); +#endif //CCL_ENABLE_MPI +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + ze_api_fini(); +#endif //CCL_ENABLE_SYCL && CCL_ENABLE_ZE +} + +void load_library(lib_info_t& info) { + //TODO: MLSL-1384, finish with parse of lib_path + info.handle = dlopen(info.path.c_str(), RTLD_LAZY | RTLD_GLOBAL); + if (!info.handle) { + LOG_WARN("could not open the library: ", info.path.c_str(), ", error: ", dlerror()); + return; + } + + void** ops = (void**)((void*)info.ops); + auto fn_names = info.fn_names; + for (size_t i = 0; i < fn_names.size(); ++i) { + ops[i] = dlsym(info.handle, fn_names[i].c_str()); + CCL_THROW_IF_NOT(ops[i], "dlsym is failed on: ", fn_names[i], ", error: ", dlerror()); + LOG_TRACE("dlsym loaded of ", fn_names.size(), " - ", i + 1, ": ", fn_names[i]); + } +} + +void close_library(lib_info_t& info) { + if (info.handle) { + dlclose(info.handle); + info.handle = nullptr; + } +} + +} //namespace ccl diff --git a/src/common/api_wrapper/api_wrapper.hpp b/src/common/api_wrapper/api_wrapper.hpp new file mode 100644 index 000000000..467938430 --- /dev/null +++ b/src/common/api_wrapper/api_wrapper.hpp @@ -0,0 +1,38 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#pragma once + +#include "oneapi/ccl/config.h" + +#include "common/global/global.hpp" +#include "common/log/log.hpp" + +namespace ccl { + +typedef struct lib_info { + std::string path; + void* handle; + void* ops; + std::vector fn_names; +} lib_info_t; + +void api_wrappers_init(); +void api_wrappers_fini(); + +void load_library(lib_info_t& info); +void close_library(lib_info_t& info); + +} //namespace ccl diff --git a/src/common/api_wrapper/mpi_api_wrapper.cpp b/src/common/api_wrapper/mpi_api_wrapper.cpp new file mode 100644 index 000000000..e84e769ce --- /dev/null +++ b/src/common/api_wrapper/mpi_api_wrapper.cpp @@ -0,0 +1,56 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "common/api_wrapper/api_wrapper.hpp" +#include "common/api_wrapper/mpi_api_wrapper.hpp" + +#if defined(CCL_ENABLE_MPI) + +namespace ccl { + +lib_info_t mpi_lib_info; +mpi_lib_ops_t mpi_lib_ops; + +bool mpi_api_init() { + bool ret = true; + + mpi_lib_info.ops = &mpi_lib_ops; + mpi_lib_info.fn_names = mpi_fn_names; + + // lib_path specifies the name and full path to the MPI library + // it should be absolute and validated path + // pointing to desired libmpi library + mpi_lib_info.path = ccl::global_data::env().mpi_lib_path; + + if (mpi_lib_info.path.empty()) { + mpi_lib_info.path = "libmpi.so.12"; + } + LOG_DEBUG("MPI lib path: ", mpi_lib_info.path); + + load_library(mpi_lib_info); + if (!mpi_lib_info.handle) + ret = false; + + return ret; +} + +void mpi_api_fini() { + LOG_DEBUG("close MPI lib: handle: ", mpi_lib_info.handle); + close_library(mpi_lib_info); +} + +} //namespace ccl + +#endif //CCL_ENABLE_MPI diff --git a/src/common/api_wrapper/mpi_api_wrapper.hpp b/src/common/api_wrapper/mpi_api_wrapper.hpp new file mode 100644 index 000000000..6e855f78d --- /dev/null +++ b/src/common/api_wrapper/mpi_api_wrapper.hpp @@ -0,0 +1,196 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#include "oneapi/ccl/config.h" + +#if defined(CCL_ENABLE_MPI) + +#include + +namespace ccl { + +typedef struct mpi_lib_ops { + decltype(MPI_Allgather) *MPI_Allgather_ptr; + decltype(MPI_Allgatherv) *MPI_Allgatherv_ptr; + decltype(MPI_Allreduce) *MPI_Allreduce_ptr; + decltype(MPI_Alltoall) *MPI_Alltoall_ptr; + decltype(MPI_Alltoallv) *MPI_Alltoallv_ptr; + decltype(MPI_Barrier) *MPI_Barrier_ptr; + decltype(MPI_Bcast) *MPI_Bcast_ptr; + decltype(MPI_Cancel) *MPI_Cancel_ptr; + decltype(MPI_Comm_create_group) *MPI_Comm_create_group_ptr; + decltype(MPI_Comm_free) *MPI_Comm_free_ptr; + decltype(MPI_Comm_get_attr) *MPI_Comm_get_attr_ptr; + decltype(MPI_Comm_get_info) *MPI_Comm_get_info_ptr; + decltype(MPI_Comm_group) *MPI_Comm_group_ptr; + decltype(MPI_Comm_rank) *MPI_Comm_rank_ptr; + decltype(MPI_Comm_set_info) *MPI_Comm_set_info_ptr; + decltype(MPI_Comm_size) *MPI_Comm_size_ptr; + decltype(MPI_Comm_split) *MPI_Comm_split_ptr; + decltype(MPI_Comm_split_type) *MPI_Comm_split_type_ptr; + decltype(MPI_Error_string) *MPI_Error_string_ptr; + decltype(MPI_Finalize) *MPI_Finalize_ptr; + decltype(MPI_Finalized) *MPI_Finalized_ptr; + decltype(MPI_Get_count) *MPI_Get_count_ptr; + decltype(MPI_Get_library_version) *MPI_Get_library_version_ptr; + decltype(MPI_Group_incl) *MPI_Group_incl_ptr; + decltype(MPI_Iallgatherv) *MPI_Iallgatherv_ptr; + decltype(MPI_Iallreduce) *MPI_Iallreduce_ptr; + decltype(MPI_Ialltoall) *MPI_Ialltoall_ptr; + decltype(MPI_Ialltoallv) *MPI_Ialltoallv_ptr; + decltype(MPI_Ibarrier) *MPI_Ibarrier_ptr; + decltype(MPI_Ibcast) *MPI_Ibcast_ptr; + decltype(MPI_Info_create) *MPI_Info_create_ptr; + decltype(MPI_Info_free) *MPI_Info_free_ptr; + decltype(MPI_Info_get) *MPI_Info_get_ptr; + decltype(MPI_Info_set) *MPI_Info_set_ptr; + decltype(MPI_Initialized) *MPI_Initialized_ptr; + decltype(MPI_Init) *MPI_Init_ptr; + decltype(MPI_Init_thread) *MPI_Init_thread_ptr; + decltype(MPI_Iprobe) *MPI_Iprobe_ptr; + decltype(MPI_Irecv) *MPI_Irecv_ptr; + 
decltype(MPI_Ireduce) *MPI_Ireduce_ptr; + decltype(MPI_Ireduce_scatter_block) *MPI_Ireduce_scatter_block_ptr; + decltype(MPI_Isend) *MPI_Isend_ptr; + decltype(MPI_Op_create) *MPI_Op_create_ptr; + decltype(MPI_Op_free) *MPI_Op_free_ptr; + decltype(MPI_Query_thread) *MPI_Query_thread_ptr; + decltype(MPI_Reduce) *MPI_Reduce_ptr; + decltype(MPI_Reduce_scatter_block) *MPI_Reduce_scatter_block_ptr; + decltype(MPI_Test) *MPI_Test_ptr; + decltype(MPI_Type_commit) *MPI_Type_commit_ptr; + decltype(MPI_Type_contiguous) *MPI_Type_contiguous_ptr; + decltype(MPI_Type_free) *MPI_Type_free_ptr; + decltype(MPI_Wait) *MPI_Wait_ptr; +} mpi_lib_ops_t; + +static std::vector mpi_fn_names = { + "MPI_Allgather", + "MPI_Allgatherv", + "MPI_Allreduce", + "MPI_Alltoall", + "MPI_Alltoallv", + "MPI_Barrier", + "MPI_Bcast", + "MPI_Cancel", + "MPI_Comm_create_group", + "MPI_Comm_free", + "MPI_Comm_get_attr", + "MPI_Comm_get_info", + "MPI_Comm_group", + "MPI_Comm_rank", + "MPI_Comm_set_info", + "MPI_Comm_size", + "MPI_Comm_split", + "MPI_Comm_split_type", + "MPI_Error_string", + "MPI_Finalize", + "MPI_Finalized", + "MPI_Get_count", + "MPI_Get_library_version", + "MPI_Group_incl", + "MPI_Iallgatherv", + "MPI_Iallreduce", + "MPI_Ialltoall", + "MPI_Ialltoallv", + "MPI_Ibarrier", + "MPI_Ibcast", + "MPI_Info_create", + "MPI_Info_free", + "MPI_Info_get", + "MPI_Info_set", + "MPI_Initialized", + "MPI_Init", + "MPI_Init_thread", + "MPI_Iprobe", + "MPI_Irecv", + "MPI_Ireduce", + "MPI_Ireduce_scatter_block", + "MPI_Isend", + "MPI_Op_create", + "MPI_Op_free", + "MPI_Query_thread", + "MPI_Reduce", + "MPI_Reduce_scatter_block", + "MPI_Test", + "MPI_Type_commit", + "MPI_Type_contiguous", + "MPI_Type_free", + "MPI_Wait", +}; + +extern ccl::mpi_lib_ops_t mpi_lib_ops; + +#define MPI_Allgather ccl::mpi_lib_ops.MPI_Allgather_ptr +#define MPI_Allgatherv ccl::mpi_lib_ops.MPI_Allgatherv_ptr +#define MPI_Allreduce ccl::mpi_lib_ops.MPI_Allreduce_ptr +#define MPI_Alltoall ccl::mpi_lib_ops.MPI_Alltoall_ptr +#define 
MPI_Alltoallv ccl::mpi_lib_ops.MPI_Alltoallv_ptr +#define MPI_Barrier ccl::mpi_lib_ops.MPI_Barrier_ptr +#define MPI_Bcast ccl::mpi_lib_ops.MPI_Bcast_ptr +#define MPI_Cancel ccl::mpi_lib_ops.MPI_Cancel_ptr +#define MPI_Comm_create_group ccl::mpi_lib_ops.MPI_Comm_create_group_ptr +#define MPI_Comm_free ccl::mpi_lib_ops.MPI_Comm_free_ptr +#define MPI_Comm_get_attr ccl::mpi_lib_ops.MPI_Comm_get_attr_ptr +#define MPI_Comm_get_info ccl::mpi_lib_ops.MPI_Comm_get_info_ptr +#define MPI_Comm_group ccl::mpi_lib_ops.MPI_Comm_group_ptr +#define MPI_Comm_rank ccl::mpi_lib_ops.MPI_Comm_rank_ptr +#define MPI_Comm_set_info ccl::mpi_lib_ops.MPI_Comm_set_info_ptr +#define MPI_Comm_size ccl::mpi_lib_ops.MPI_Comm_size_ptr +#define MPI_Comm_split ccl::mpi_lib_ops.MPI_Comm_split_ptr +#define MPI_Comm_split_type ccl::mpi_lib_ops.MPI_Comm_split_type_ptr +#define MPI_Error_string ccl::mpi_lib_ops.MPI_Error_string_ptr +#define MPI_Finalize ccl::mpi_lib_ops.MPI_Finalize_ptr +#define MPI_Finalized ccl::mpi_lib_ops.MPI_Finalized_ptr +#define MPI_Get_count ccl::mpi_lib_ops.MPI_Get_count_ptr +#define MPI_Get_library_version ccl::mpi_lib_ops.MPI_Get_library_version_ptr +#define MPI_Group_incl ccl::mpi_lib_ops.MPI_Group_incl_ptr +#define MPI_Iallgatherv ccl::mpi_lib_ops.MPI_Iallgatherv_ptr +#define MPI_Iallreduce ccl::mpi_lib_ops.MPI_Iallreduce_ptr +#define MPI_Ialltoall ccl::mpi_lib_ops.MPI_Ialltoall_ptr +#define MPI_Ialltoallv ccl::mpi_lib_ops.MPI_Ialltoallv_ptr +#define MPI_Ibarrier ccl::mpi_lib_ops.MPI_Ibarrier_ptr +#define MPI_Ibcast ccl::mpi_lib_ops.MPI_Ibcast_ptr +#define MPI_Info_create ccl::mpi_lib_ops.MPI_Info_create_ptr +#define MPI_Info_free ccl::mpi_lib_ops.MPI_Info_free_ptr +#define MPI_Info_get ccl::mpi_lib_ops.MPI_Info_get_ptr +#define MPI_Info_set ccl::mpi_lib_ops.MPI_Info_set_ptr +#define MPI_Initialized ccl::mpi_lib_ops.MPI_Initialized_ptr +#define MPI_Init ccl::mpi_lib_ops.MPI_Init_ptr +#define MPI_Init_thread ccl::mpi_lib_ops.MPI_Init_thread_ptr +#define MPI_Iprobe 
ccl::mpi_lib_ops.MPI_Iprobe_ptr +#define MPI_Irecv ccl::mpi_lib_ops.MPI_Irecv_ptr +#define MPI_Ireduce ccl::mpi_lib_ops.MPI_Ireduce_ptr +#define MPI_Ireduce_scatter_block ccl::mpi_lib_ops.MPI_Ireduce_scatter_block_ptr +#define MPI_Isend ccl::mpi_lib_ops.MPI_Isend_ptr +#define MPI_Op_create ccl::mpi_lib_ops.MPI_Op_create_ptr +#define MPI_Op_free ccl::mpi_lib_ops.MPI_Op_free_ptr +#define MPI_Query_thread ccl::mpi_lib_ops.MPI_Query_thread_ptr +#define MPI_Reduce ccl::mpi_lib_ops.MPI_Reduce_ptr +#define MPI_Reduce_scatter_block ccl::mpi_lib_ops.MPI_Reduce_scatter_block_ptr +#define MPI_Test ccl::mpi_lib_ops.MPI_Test_ptr +#define MPI_Type_commit ccl::mpi_lib_ops.MPI_Type_commit_ptr +#define MPI_Type_contiguous ccl::mpi_lib_ops.MPI_Type_contiguous_ptr +#define MPI_Type_free ccl::mpi_lib_ops.MPI_Type_free_ptr +#define MPI_Wait ccl::mpi_lib_ops.MPI_Wait_ptr + +bool mpi_api_init(); +void mpi_api_fini(); + +} //namespace ccl + +#endif //CCL_ENABLE_MPI diff --git a/src/common/api_wrapper/ofi_api_wrapper.cpp b/src/common/api_wrapper/ofi_api_wrapper.cpp new file mode 100644 index 000000000..3cd6b7f2c --- /dev/null +++ b/src/common/api_wrapper/ofi_api_wrapper.cpp @@ -0,0 +1,52 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "common/api_wrapper/api_wrapper.hpp" +#include "common/api_wrapper/ofi_api_wrapper.hpp" + +namespace ccl { + +lib_info_t ofi_lib_info; +ofi_lib_ops_t ofi_lib_ops; + +bool ofi_api_init() { + bool ret = true; + + ofi_lib_info.ops = &ofi_lib_ops; + ofi_lib_info.fn_names = ofi_fn_names; + + // lib_path specifies the name and full path to the OFI library + // it should be absolute and validated path + // pointing to desired libfabric library + ofi_lib_info.path = ccl::global_data::env().ofi_lib_path; + + if (ofi_lib_info.path.empty()) { + ofi_lib_info.path = "libfabric.so.1"; + } + LOG_DEBUG("OFI lib path: ", ofi_lib_info.path); + + load_library(ofi_lib_info); + if (!ofi_lib_info.handle) + ret = false; + + return ret; +} + +void ofi_api_fini() { + LOG_DEBUG("close OFI lib: handle: ", ofi_lib_info.handle); + close_library(ofi_lib_info); +} + +} //namespace ccl diff --git a/src/common/api_wrapper/ofi_api_wrapper.hpp b/src/common/api_wrapper/ofi_api_wrapper.hpp new file mode 100644 index 000000000..9a4da26ce --- /dev/null +++ b/src/common/api_wrapper/ofi_api_wrapper.hpp @@ -0,0 +1,54 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#include "oneapi/ccl/config.h" + +#include +#include +#include +#include +#include + +namespace ccl { + +typedef struct ofi_lib_ops { + decltype(fi_dupinfo) *fi_dupinfo_ptr; + decltype(fi_fabric) *fi_fabric_ptr; + decltype(fi_freeinfo) *fi_freeinfo_ptr; + decltype(fi_getinfo) *fi_getinfo_ptr; + decltype(fi_strerror) *fi_strerror_ptr; + decltype(fi_tostr) *fi_tostr_ptr; +} ofi_lib_ops_t; + +static std::vector ofi_fn_names = { + "fi_dupinfo", "fi_fabric", "fi_freeinfo", "fi_getinfo", "fi_strerror", "fi_tostr", +}; + +extern ccl::ofi_lib_ops_t ofi_lib_ops; + +#define fi_allocinfo() (fi_dupinfo)(NULL) +#define fi_dupinfo ccl::ofi_lib_ops.fi_dupinfo_ptr +#define fi_fabric ccl::ofi_lib_ops.fi_fabric_ptr +#define fi_freeinfo ccl::ofi_lib_ops.fi_freeinfo_ptr +#define fi_getinfo ccl::ofi_lib_ops.fi_getinfo_ptr +#define fi_strerror ccl::ofi_lib_ops.fi_strerror_ptr +#define fi_tostr ccl::ofi_lib_ops.fi_tostr_ptr + +bool ofi_api_init(); +void ofi_api_fini(); + +} //namespace ccl diff --git a/src/common/api_wrapper/pmix_api_wrapper.cpp b/src/common/api_wrapper/pmix_api_wrapper.cpp new file mode 100644 index 000000000..d0349141f --- /dev/null +++ b/src/common/api_wrapper/pmix_api_wrapper.cpp @@ -0,0 +1,105 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "common/api_wrapper/api_wrapper.hpp" +#include "common/api_wrapper/pmix_api_wrapper.hpp" + +namespace ccl { + +#ifdef CCL_ENABLE_PMIX +static pmix_proc_t global_proc; + +ccl::lib_info_t pmix_lib_info; +pmix_lib_ops_t pmix_lib_ops; + +bool get_pmix_local_coord(int *local_proc_idx, int *local_proc_count) { + *local_proc_idx = CCL_ENV_INT_NOT_SPECIFIED; + *local_proc_count = CCL_ENV_INT_NOT_SPECIFIED; + + pmix_status_t rc = PMIX_SUCCESS; + pmix_value_t *val = NULL; + pmix_proc_t proc; + + if (PMIX_SUCCESS != (rc = PMIx_Init(&global_proc, NULL, 0))) { + LOG_WARN("PMIx_Init failed: ", PMIx_Error_string(rc)); + return false; + } + + PMIX_PROC_CONSTRUCT(&proc); + strcpy(proc.nspace, global_proc.nspace); + proc.rank = PMIX_RANK_WILDCARD; + + // number of local ranks on node + if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_LOCAL_SIZE, NULL, 0, &val))) { + LOG_WARN("PMIx_Get(PMIX_LOCAL_SIZE) failed: ", PMIx_Error_string(rc)); + return false; + } + *local_proc_count = val->data.uint32; + PMIX_VALUE_RELEASE(val); + + // my local rank on node + if (PMIX_SUCCESS != (rc = PMIx_Get(&global_proc, PMIX_LOCAL_RANK, NULL, 0, &val))) { + LOG_WARN("PMIx_Get(PMIX_LOCAL_RANK) failed: ", PMIx_Error_string(rc)); + return false; + } + *local_proc_idx = val->data.uint16; + PMIX_VALUE_RELEASE(val); + + LOG_DEBUG("get pmix_local_rank/size - local_proc_idx: ", + *local_proc_idx, + ", local_proc_count: ", + *local_proc_count); + return true; +} +#endif // CCL_ENABLE_PMIX + +void pmix_api_init() { +#ifdef CCL_ENABLE_PMIX + if (ccl::global_data::env().process_launcher == process_launcher_mode::pmix) { + pmix_lib_info.ops = &pmix_lib_ops; + pmix_lib_info.fn_names = pmix_fn_names; + + // lib_path specifies the name and full path to the PMIX library + // it should be absolute and validated path + // pointing to desired libpmix library + pmix_lib_info.path = ccl::global_data::env().pmix_lib_path; + + if (pmix_lib_info.path.empty()) { + pmix_lib_info.path = "libpmix.so"; + } + 
LOG_DEBUG("pmix lib path: ", pmix_lib_info.path); + + load_library(pmix_lib_info); + + CCL_THROW_IF_NOT(pmix_lib_info.handle != nullptr, "could not initialize PMIX api"); + } +#endif // CCL_ENABLE_PMIX +} + +void pmix_api_fini() { +#ifdef CCL_ENABLE_PMIX + if (ccl::global_data::env().process_launcher == process_launcher_mode::pmix) { + pmix_status_t rc = PMIX_SUCCESS; + if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) { + CCL_THROW("PMIx_Finalize failed: ", PMIx_Error_string(rc)); + } + + LOG_DEBUG("close pmix lib: handle: ", pmix_lib_info.handle); + close_library(pmix_lib_info); + } +#endif // CCL_ENABLE_PMIX +} + +} //namespace ccl diff --git a/src/common/api_wrapper/pmix_api_wrapper.hpp b/src/common/api_wrapper/pmix_api_wrapper.hpp new file mode 100644 index 000000000..2a7f9f23c --- /dev/null +++ b/src/common/api_wrapper/pmix_api_wrapper.hpp @@ -0,0 +1,58 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#include "oneapi/ccl/config.h" + +#include +#include + +#ifdef CCL_ENABLE_PMIX +#include +#endif // CCL_ENABLE_PMIX + +namespace ccl { + +#ifdef CCL_ENABLE_PMIX +typedef struct pmix_lib_ops { + decltype(::PMIx_Init) *PMIx_Init; + decltype(::PMIx_Error_string) *PMIx_Error_string; + decltype(::PMIx_Get) *PMIx_Get; + decltype(::PMIx_Finalize) *PMIx_Finalize; + decltype(::PMIx_Value_destruct) *PMIx_Value_destruct; +} pmix_lib_ops_t; + +static std::vector pmix_fn_names = { "PMIx_Init", + "PMIx_Error_string", + "PMIx_Get", + "PMIx_Finalize", + "PMIx_Value_destruct" }; + +extern ccl::pmix_lib_ops_t pmix_lib_ops; + +#define PMIx_Init ccl::pmix_lib_ops.PMIx_Init +#define PMIx_Error_string ccl::pmix_lib_ops.PMIx_Error_string +#define PMIx_Get ccl::pmix_lib_ops.PMIx_Get +#define PMIx_Finalize ccl::pmix_lib_ops.PMIx_Finalize +#define PMIx_Value_destruct ccl::pmix_lib_ops.PMIx_Value_destruct + +bool get_pmix_local_coord(int *local_proc_idx, int *local_proc_count); +#endif // CCL_ENABLE_PMIX + +void pmix_api_init(); +void pmix_api_fini(); + +} //namespace ccl diff --git a/src/common/api_wrapper/ze_api_wrapper.cpp b/src/common/api_wrapper/ze_api_wrapper.cpp new file mode 100644 index 000000000..6b6d98bf2 --- /dev/null +++ b/src/common/api_wrapper/ze_api_wrapper.cpp @@ -0,0 +1,54 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "common/api_wrapper/api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" +#include "common/stream/stream.hpp" +#include "sched/entry/ze/ze_primitives.hpp" + +namespace ccl { + +ccl::lib_info_t ze_lib_info; +ze_lib_ops_t ze_lib_ops; + +bool ze_api_init() { + bool ret = true; + + ze_lib_info.ops = &ze_lib_ops; + ze_lib_info.fn_names = ze_fn_names; + + // lib_path specifies the name and full path to the level-zero library + // it should be absolute and validated path + // pointing to desired libze_loader library + ze_lib_info.path = ccl::global_data::env().ze_lib_path; + + if (ze_lib_info.path.empty()) { + ze_lib_info.path = "libze_loader.so"; + } + LOG_DEBUG("level-zero lib path: ", ze_lib_info.path); + + load_library(ze_lib_info); + if (!ze_lib_info.handle) + ret = false; + + return ret; +} + +void ze_api_fini() { + LOG_DEBUG("close level-zero lib: handle: ", ze_lib_info.handle); + close_library(ze_lib_info); +} + +} //namespace ccl diff --git a/src/common/ze/ze_api_wrapper.hpp b/src/common/api_wrapper/ze_api_wrapper.hpp similarity index 54% rename from src/common/ze/ze_api_wrapper.hpp rename to src/common/api_wrapper/ze_api_wrapper.hpp index 713a157df..acce9a8ac 100644 --- a/src/common/ze/ze_api_wrapper.hpp +++ b/src/common/api_wrapper/ze_api_wrapper.hpp @@ -17,7 +17,8 @@ #include "oneapi/ccl/config.h" -#include +#include +#include #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) @@ -26,7 +27,7 @@ namespace ccl { -typedef struct libze_ops { +typedef struct ze_lib_ops { decltype(zeInit) *zeInit; decltype(zeDriverGet) *zeDriverGet; decltype(zeDriverGetApiVersion) *zeDriverGetApiVersion; @@ -94,9 +95,9 @@ typedef struct libze_ops { decltype(zesFabricPortGetConfig) *zesFabricPortGetConfig; decltype(zesFabricPortGetProperties) *zesFabricPortGetProperties; decltype(zesFabricPortGetState) *zesFabricPortGetState; -} libze_ops_t; +} ze_lib_ops_t; -static const char *fn_names[] = { +static std::vector ze_fn_names = { "zeInit", "zeDriverGet", 
"zeDriverGetApiVersion", @@ -166,75 +167,76 @@ static const char *fn_names[] = { "zesFabricPortGetState", }; -extern ccl::libze_ops_t libze_ops; +extern ccl::ze_lib_ops_t ze_lib_ops; -#define zeInit ccl::libze_ops.zeInit -#define zeDriverGet ccl::libze_ops.zeDriverGet -#define zeDriverGetApiVersion ccl::libze_ops.zeDriverGetApiVersion -#define zeMemGetAllocProperties ccl::libze_ops.zeMemGetAllocProperties -#define zeMemGetAddressRange ccl::libze_ops.zeMemGetAddressRange -#define zeMemAllocHost ccl::libze_ops.zeMemAllocHost -#define zeMemAllocDevice ccl::libze_ops.zeMemAllocDevice -#define zeMemAllocShared ccl::libze_ops.zeMemAllocShared -#define zeMemFree ccl::libze_ops.zeMemFree -#define zeMemOpenIpcHandle ccl::libze_ops.zeMemOpenIpcHandle -#define zeMemCloseIpcHandle ccl::libze_ops.zeMemCloseIpcHandle -#define zeMemGetIpcHandle ccl::libze_ops.zeMemGetIpcHandle -#define zeDeviceGet ccl::libze_ops.zeDeviceGet -#define zeDeviceGetProperties ccl::libze_ops.zeDeviceGetProperties -#define zeDeviceCanAccessPeer ccl::libze_ops.zeDeviceCanAccessPeer -#define zeDeviceGetCommandQueueGroupProperties ccl::libze_ops.zeDeviceGetCommandQueueGroupProperties -#define zeDeviceGetP2PProperties ccl::libze_ops.zeDeviceGetP2PProperties -#define zeDeviceGetGlobalTimestamps ccl::libze_ops.zeDeviceGetGlobalTimestamps -#define zeDriverGetProperties ccl::libze_ops.zeDriverGetProperties -#define zeDriverGetIpcProperties ccl::libze_ops.zeDriverGetIpcProperties -#define zeCommandQueueCreate ccl::libze_ops.zeCommandQueueCreate -#define zeCommandQueueExecuteCommandLists ccl::libze_ops.zeCommandQueueExecuteCommandLists -#define zeCommandQueueSynchronize ccl::libze_ops.zeCommandQueueSynchronize -#define zeCommandQueueDestroy ccl::libze_ops.zeCommandQueueDestroy -#define zeCommandListCreate ccl::libze_ops.zeCommandListCreate -#define zeCommandListCreateImmediate ccl::libze_ops.zeCommandListCreateImmediate -#define zeCommandListAppendMemoryCopy ccl::libze_ops.zeCommandListAppendMemoryCopy -#define 
zeCommandListAppendLaunchKernel ccl::libze_ops.zeCommandListAppendLaunchKernel -#define zeCommandListAppendWaitOnEvents ccl::libze_ops.zeCommandListAppendWaitOnEvents -#define zeCommandListAppendBarrier ccl::libze_ops.zeCommandListAppendBarrier -#define zeCommandListClose ccl::libze_ops.zeCommandListClose -#define zeCommandListReset ccl::libze_ops.zeCommandListReset -#define zeCommandListDestroy ccl::libze_ops.zeCommandListDestroy -#define zeContextCreate ccl::libze_ops.zeContextCreate -#define zeContextDestroy ccl::libze_ops.zeContextDestroy -#define zeEventPoolCreate ccl::libze_ops.zeEventPoolCreate -#define zeEventCreate ccl::libze_ops.zeEventCreate -#define zeEventQueryStatus ccl::libze_ops.zeEventQueryStatus -#define zeEventHostSynchronize ccl::libze_ops.zeEventHostSynchronize -#define zeEventHostReset ccl::libze_ops.zeEventHostReset -#define zeEventHostSignal ccl::libze_ops.zeEventHostSignal -#define zeEventDestroy ccl::libze_ops.zeEventDestroy -#define zeEventPoolOpenIpcHandle ccl::libze_ops.zeEventPoolOpenIpcHandle -#define zeEventPoolCloseIpcHandle ccl::libze_ops.zeEventPoolCloseIpcHandle -#define zeEventPoolGetIpcHandle ccl::libze_ops.zeEventPoolGetIpcHandle -#define zeEventQueryKernelTimestamp ccl::libze_ops.zeEventQueryKernelTimestamp -#define zeEventPoolDestroy ccl::libze_ops.zeEventPoolDestroy -#define zeFenceHostSynchronize ccl::libze_ops.zeFenceHostSynchronize -#define zeFenceCreate ccl::libze_ops.zeFenceCreate -#define zeKernelCreate ccl::libze_ops.zeKernelCreate -#define zeKernelSetArgumentValue ccl::libze_ops.zeKernelSetArgumentValue -#define zeKernelSuggestGroupSize ccl::libze_ops.zeKernelSuggestGroupSize -#define zeKernelSetGroupSize ccl::libze_ops.zeKernelSetGroupSize -#define zeKernelDestroy ccl::libze_ops.zeKernelDestroy -#define zeModuleCreate ccl::libze_ops.zeModuleCreate -#define zeModuleDestroy ccl::libze_ops.zeModuleDestroy -#define zeModuleBuildLogGetString ccl::libze_ops.zeModuleBuildLogGetString -#define zeModuleBuildLogDestroy 
ccl::libze_ops.zeModuleBuildLogDestroy -#define zeDeviceGetComputeProperties ccl::libze_ops.zeDeviceGetComputeProperties -#define zeDeviceGetMemoryAccessProperties ccl::libze_ops.zeDeviceGetMemoryAccessProperties -#define zeDeviceGetMemoryProperties ccl::libze_ops.zeDeviceGetMemoryProperties -#define zeDeviceGetSubDevices ccl::libze_ops.zeDeviceGetSubDevices -#define zesDevicePciGetProperties ccl::libze_ops.zesDevicePciGetProperties -#define zesDeviceEnumFabricPorts ccl::libze_ops.zesDeviceEnumFabricPorts -#define zesFabricPortGetConfig ccl::libze_ops.zesFabricPortGetConfig -#define zesFabricPortGetProperties ccl::libze_ops.zesFabricPortGetProperties -#define zesFabricPortGetState ccl::libze_ops.zesFabricPortGetState +#define zeInit ccl::ze_lib_ops.zeInit +#define zeDriverGet ccl::ze_lib_ops.zeDriverGet +#define zeDriverGetApiVersion ccl::ze_lib_ops.zeDriverGetApiVersion +#define zeMemGetAllocProperties ccl::ze_lib_ops.zeMemGetAllocProperties +#define zeMemGetAddressRange ccl::ze_lib_ops.zeMemGetAddressRange +#define zeMemAllocHost ccl::ze_lib_ops.zeMemAllocHost +#define zeMemAllocDevice ccl::ze_lib_ops.zeMemAllocDevice +#define zeMemAllocShared ccl::ze_lib_ops.zeMemAllocShared +#define zeMemFree ccl::ze_lib_ops.zeMemFree +#define zeMemOpenIpcHandle ccl::ze_lib_ops.zeMemOpenIpcHandle +#define zeMemCloseIpcHandle ccl::ze_lib_ops.zeMemCloseIpcHandle +#define zeMemGetIpcHandle ccl::ze_lib_ops.zeMemGetIpcHandle +#define zeDeviceGet ccl::ze_lib_ops.zeDeviceGet +#define zeDeviceGetProperties ccl::ze_lib_ops.zeDeviceGetProperties +#define zeDeviceCanAccessPeer ccl::ze_lib_ops.zeDeviceCanAccessPeer +#define zeDeviceGetCommandQueueGroupProperties \ + ccl::ze_lib_ops.zeDeviceGetCommandQueueGroupProperties +#define zeDeviceGetP2PProperties ccl::ze_lib_ops.zeDeviceGetP2PProperties +#define zeDeviceGetGlobalTimestamps ccl::ze_lib_ops.zeDeviceGetGlobalTimestamps +#define zeDriverGetProperties ccl::ze_lib_ops.zeDriverGetProperties +#define zeDriverGetIpcProperties 
ccl::ze_lib_ops.zeDriverGetIpcProperties +#define zeCommandQueueCreate ccl::ze_lib_ops.zeCommandQueueCreate +#define zeCommandQueueExecuteCommandLists ccl::ze_lib_ops.zeCommandQueueExecuteCommandLists +#define zeCommandQueueSynchronize ccl::ze_lib_ops.zeCommandQueueSynchronize +#define zeCommandQueueDestroy ccl::ze_lib_ops.zeCommandQueueDestroy +#define zeCommandListCreate ccl::ze_lib_ops.zeCommandListCreate +#define zeCommandListCreateImmediate ccl::ze_lib_ops.zeCommandListCreateImmediate +#define zeCommandListAppendMemoryCopy ccl::ze_lib_ops.zeCommandListAppendMemoryCopy +#define zeCommandListAppendLaunchKernel ccl::ze_lib_ops.zeCommandListAppendLaunchKernel +#define zeCommandListAppendWaitOnEvents ccl::ze_lib_ops.zeCommandListAppendWaitOnEvents +#define zeCommandListAppendBarrier ccl::ze_lib_ops.zeCommandListAppendBarrier +#define zeCommandListClose ccl::ze_lib_ops.zeCommandListClose +#define zeCommandListReset ccl::ze_lib_ops.zeCommandListReset +#define zeCommandListDestroy ccl::ze_lib_ops.zeCommandListDestroy +#define zeContextCreate ccl::ze_lib_ops.zeContextCreate +#define zeContextDestroy ccl::ze_lib_ops.zeContextDestroy +#define zeEventPoolCreate ccl::ze_lib_ops.zeEventPoolCreate +#define zeEventCreate ccl::ze_lib_ops.zeEventCreate +#define zeEventQueryStatus ccl::ze_lib_ops.zeEventQueryStatus +#define zeEventHostSynchronize ccl::ze_lib_ops.zeEventHostSynchronize +#define zeEventHostReset ccl::ze_lib_ops.zeEventHostReset +#define zeEventHostSignal ccl::ze_lib_ops.zeEventHostSignal +#define zeEventDestroy ccl::ze_lib_ops.zeEventDestroy +#define zeEventPoolOpenIpcHandle ccl::ze_lib_ops.zeEventPoolOpenIpcHandle +#define zeEventPoolCloseIpcHandle ccl::ze_lib_ops.zeEventPoolCloseIpcHandle +#define zeEventPoolGetIpcHandle ccl::ze_lib_ops.zeEventPoolGetIpcHandle +#define zeEventQueryKernelTimestamp ccl::ze_lib_ops.zeEventQueryKernelTimestamp +#define zeEventPoolDestroy ccl::ze_lib_ops.zeEventPoolDestroy +#define zeFenceHostSynchronize 
ccl::ze_lib_ops.zeFenceHostSynchronize +#define zeFenceCreate ccl::ze_lib_ops.zeFenceCreate +#define zeKernelCreate ccl::ze_lib_ops.zeKernelCreate +#define zeKernelSetArgumentValue ccl::ze_lib_ops.zeKernelSetArgumentValue +#define zeKernelSuggestGroupSize ccl::ze_lib_ops.zeKernelSuggestGroupSize +#define zeKernelSetGroupSize ccl::ze_lib_ops.zeKernelSetGroupSize +#define zeKernelDestroy ccl::ze_lib_ops.zeKernelDestroy +#define zeModuleCreate ccl::ze_lib_ops.zeModuleCreate +#define zeModuleDestroy ccl::ze_lib_ops.zeModuleDestroy +#define zeModuleBuildLogGetString ccl::ze_lib_ops.zeModuleBuildLogGetString +#define zeModuleBuildLogDestroy ccl::ze_lib_ops.zeModuleBuildLogDestroy +#define zeDeviceGetComputeProperties ccl::ze_lib_ops.zeDeviceGetComputeProperties +#define zeDeviceGetMemoryAccessProperties ccl::ze_lib_ops.zeDeviceGetMemoryAccessProperties +#define zeDeviceGetMemoryProperties ccl::ze_lib_ops.zeDeviceGetMemoryProperties +#define zeDeviceGetSubDevices ccl::ze_lib_ops.zeDeviceGetSubDevices +#define zesDevicePciGetProperties ccl::ze_lib_ops.zesDevicePciGetProperties +#define zesDeviceEnumFabricPorts ccl::ze_lib_ops.zesDeviceEnumFabricPorts +#define zesFabricPortGetConfig ccl::ze_lib_ops.zesFabricPortGetConfig +#define zesFabricPortGetProperties ccl::ze_lib_ops.zesFabricPortGetProperties +#define zesFabricPortGetState ccl::ze_lib_ops.zesFabricPortGetState bool ze_api_init(); void ze_api_fini(); diff --git a/src/common/datatype/datatype.cpp b/src/common/datatype/datatype.cpp index 169ac5a6e..d9d8484b1 100644 --- a/src/common/datatype/datatype.cpp +++ b/src/common/datatype/datatype.cpp @@ -23,7 +23,8 @@ const ccl::datatype last_predefined_dt = ccl::datatype::bfloat16; namespace ccl { -using datatype_str_enum = ::utils::enum_to_str<::utils::enum_to_underlying(last_predefined_dt) + 1>; +using datatype_str_enum = + ccl::utils::enum_to_str; string_class to_string(const datatype& dt) { return datatype_str_enum({ "INT8", "UINT8", @@ -148,10 +149,11 @@ ccl::datatype 
ccl_datatype_storage::create_by_datatype_size(size_t datatype_size } CCL_ASSERT(datatype_size > 0); - create_internal(custom_table, - custom_idx, - datatype_size, - std::string("DTYPE_") + std::to_string(utils::enum_to_underlying(custom_idx))); + create_internal( + custom_table, + custom_idx, + datatype_size, + std::string("DTYPE_") + std::to_string(ccl::utils::enum_to_underlying(custom_idx))); return custom_idx; } diff --git a/src/common/env/env.cpp b/src/common/env/env.cpp index 8bbf49b10..90d513d36 100644 --- a/src/common/env/env.cpp +++ b/src/common/env/env.cpp @@ -55,13 +55,6 @@ std::map env_data::staging_buffer_names = { std::make_pair(ccl_staging_usm, "usm") }; -std::map env_data::ze_copy_engine_names = { - std::make_pair(ccl_ze_copy_engine_none, "none"), - std::make_pair(ccl_ze_copy_engine_main, "main"), - std::make_pair(ccl_ze_copy_engine_link, "link"), - std::make_pair(ccl_ze_copy_engine_auto, "auto") -}; - std::map env_data::backend_names = { std::make_pair(backend_mode::native, "native"), #ifdef CCL_ENABLE_STUB_BACKEND @@ -72,6 +65,9 @@ std::map env_data::backend_names = { std::map env_data::process_launcher_names = { std::make_pair(process_launcher_mode::hydra, "hydra"), std::make_pair(process_launcher_mode::torch, "torch"), +#ifdef CCL_ENABLE_PMIX + std::make_pair(process_launcher_mode::pmix, "pmix"), +#endif // CCL_ENABLE_PMIX std::make_pair(process_launcher_mode::none, "none") }; @@ -83,6 +79,7 @@ env_data::env_data() queue_dump(0), sched_dump(0), sched_profile(0), + entry_max_update_time_sec(CCL_ENV_SIZET_NOT_SPECIFIED), fw_type(ccl_framework_none), @@ -129,22 +126,33 @@ env_data::env_data() enable_buffer_cache(1), enable_strict_order(0), staging_buffer(ccl_staging_regular), +#ifdef CCL_ENABLE_SYCL + enable_op_sync(1), +#else // CCL_ENABLE_SYCL enable_op_sync(0), +#endif // CCL_ENABLE_SYCL + enable_external_queue(0), chunk_count(1), min_chunk_size(65536), rs_chunk_count(1), rs_min_chunk_size(65536), - ar2d_chunk_count(1), - 
ar2d_min_chunk_size(65536), +#ifdef CCL_ENABLE_SYCL allgatherv_topo_large_scale(0), + allgatherv_topo_read(1), + alltoallv_topo_read(1), + reduce_scatter_monolithic_kernel(0), + allgatherv_monolithic_kernel(0), +#endif // CCL_ENABLE_SYCL - allreduce_2d_base_size(CCL_ENV_SIZET_NOT_SPECIFIED), - allreduce_2d_switch_dims(0), allreduce_nreduce_buffering(0), allreduce_nreduce_segment_size(CCL_ENV_SIZET_NOT_SPECIFIED), + allreduce_2d_chunk_count(1), + allreduce_2d_min_chunk_size(65536), + allreduce_2d_switch_dims(0), + alltoall_scatter_max_ops(CCL_ENV_SIZET_NOT_SPECIFIED), backend(backend_mode::native), @@ -155,49 +163,73 @@ env_data::env_data() process_launcher(process_launcher_mode::hydra), enable_topo_algo(1), +#ifdef CCL_ENABLE_SYCL + topo_color(topo_color_mode::ze), +#else // CCL_ENABLE_SYCL topo_color(topo_color_mode::fixed), +#endif // CCL_ENABLE_SYCL enable_p2p_access(CCL_ENV_INT_NOT_SPECIFIED), +#ifdef CCL_ENABLE_MPI + mpi_lib_path(), +#endif // CCL_ENABLE_MPI + ofi_lib_path(), + #ifdef CCL_ENABLE_SYCL kernel_path(), kernel_debug(0), - kernel_group_size(CCL_ENV_SIZET_NOT_SPECIFIED), + + // 32 is more generic constant value + // for gpus to avoid imbalance issue + kernel_group_size(32), kernel_group_count(CCL_ENV_SIZET_NOT_SPECIFIED), + kernel_mem_align(128), + enable_kernel_sync(1), kernel_1s_lead(0), enable_kernel_1s_copy_ops(0), enable_kernel_1s_ipc_wa(0), + enable_kernel_single_reduce_peers(1), enable_close_fd_wa(1), enable_sycl_output_event(0), use_hmem(1), enable_ze_barrier(0), - enable_ze_bidir_algo(0), + enable_ze_bidir_algo(1), enable_ze_cache(1), - enable_ze_cache_ipc_handles(1), - ze_cache_ipc_handles_threshold(100), + enable_ze_cache_open_ipc_handles(1), + ze_cache_open_ipc_handles_threshold(100), + enable_ze_cache_get_ipc_handles(1), enable_ze_single_list(1), disable_ze_family_check(0), disable_ze_port_check(0), + ze_disable_oversubscription_check(0), ze_serialize_mode(0), - ze_copy_engine(ccl_ze_copy_engine_none), + 
ze_copy_engine(ccl::ze::copy_engine_mode::link), + ze_h2d_copy_engine(ccl::ze::h2d_copy_engine_mode::none), ze_max_compute_queues(1), ze_max_copy_queues(CCL_ENV_SIZET_NOT_SPECIFIED), + ze_enable_ccs_fallback_for_copy(1), enable_ze_list_dump(0), - ze_queue_index_offset(1), + ze_queue_index_offset(0), ze_close_ipc_wa(0), ze_lib_path(), ze_enable(1), ze_fini_wa(0), ze_multi_workers(0), + ze_ipc_exchange(ccl::ze::ipc_exchange_mode::drmfd), #endif // CCL_ENABLE_SYCL +#ifdef CCL_ENABLE_PMIX + pmix_lib_path(), +#endif // CCL_ENABLE_PMIX + #ifdef CCL_ENABLE_ITT itt_level(0), #endif // CCL_ENABLE_ITT - bf16_impl_type(ccl_bf16_no_compiler_support), + bf16_impl_type(ccl_bf16_scalar), fp16_impl_type(ccl_fp16_no_compiler_support) { } @@ -209,6 +241,13 @@ void env_data::parse() { env_2_type(CCL_QUEUE_DUMP, queue_dump); env_2_type(CCL_SCHED_DUMP, sched_dump); env_2_type(CCL_SCHED_PROFILE, sched_profile); + env_2_type(CCL_ENTRY_MAX_UPDATE_TIME_SEC, entry_max_update_time_sec); + CCL_THROW_IF_NOT( + entry_max_update_time_sec == CCL_ENV_SIZET_NOT_SPECIFIED || entry_max_update_time_sec > 0, + "incorrect ", + CCL_ENTRY_MAX_UPDATE_TIME_SEC, + " ", + entry_max_update_time_sec); if (fw_type == ccl_framework_none) { /* try to automatically detect framework */ @@ -262,6 +301,7 @@ void env_data::parse() { env_2_type(CCL_ALGO_FALLBACK, enable_algo_fallback); env_2_type(CCL_ALLGATHERV, allgatherv_algo_raw); env_2_type(CCL_ALLREDUCE, allreduce_algo_raw); + env_2_type(CCL_ALLREDUCE_SCALEOUT, allreduce_scaleout_algo_raw); env_2_type(CCL_ALLTOALL, alltoall_algo_raw); env_2_type(CCL_ALLTOALLV, alltoallv_algo_raw); env_2_type(CCL_BARRIER, barrier_algo_raw); @@ -312,6 +352,7 @@ void env_data::parse() { } env_2_enum(CCL_STAGING_BUFFER, staging_buffer_names, staging_buffer); env_2_type(CCL_OP_SYNC, enable_op_sync); + env_2_type(CCL_USE_EXTERNAL_QUEUE, enable_external_queue); env_2_type(CCL_CHUNK_COUNT, chunk_count); CCL_THROW_IF_NOT(chunk_count >= 1, "incorrect ", CCL_CHUNK_COUNT, " ", chunk_count); @@ 
-324,20 +365,31 @@ void env_data::parse() { CCL_THROW_IF_NOT( rs_min_chunk_size >= 1, "incorrect ", CCL_RS_MIN_CHUNK_SIZE, " ", rs_min_chunk_size); - env_2_type(CCL_AR2D_CHUNK_COUNT, ar2d_chunk_count); - CCL_THROW_IF_NOT( - ar2d_chunk_count >= 1, "incorrect ", CCL_AR2D_CHUNK_COUNT, " ", ar2d_chunk_count); - env_2_type(CCL_AR2D_MIN_CHUNK_SIZE, ar2d_min_chunk_size); - CCL_THROW_IF_NOT( - ar2d_min_chunk_size >= 1, "incorrect ", CCL_AR2D_MIN_CHUNK_SIZE, " ", ar2d_min_chunk_size); - +#ifdef CCL_ENABLE_SYCL env_2_type(CCL_ALLGATHERV_TOPO_LARGE_SCALE, allgatherv_topo_large_scale); + env_2_type(CCL_ALLGATHERV_TOPO_READ, allgatherv_topo_read); + env_2_type(CCL_ALLTOALLV_TOPO_READ, alltoallv_topo_read); + env_2_type(CCL_REDUCE_SCATTER_MONOLITHIC_KERNEL, reduce_scatter_monolithic_kernel); + env_2_type(CCL_ALLGATHERV_MONOLITHIC_KERNEL, allgatherv_monolithic_kernel); +#endif // CCL_ENABLE_SYCL - env_2_type(CCL_ALLREDUCE_2D_BASE_SIZE, (size_t&)allreduce_2d_base_size); - env_2_type(CCL_ALLREDUCE_2D_SWITCH_DIMS, allreduce_2d_switch_dims); env_2_type(CCL_ALLREDUCE_NREDUCE_BUFFERING, allreduce_nreduce_buffering); env_2_type(CCL_ALLREDUCE_NREDUCE_SEGMENT_SIZE, (size_t&)allreduce_nreduce_segment_size); + env_2_type(CCL_ALLREDUCE_2D_CHUNK_COUNT, allreduce_2d_chunk_count); + CCL_THROW_IF_NOT(allreduce_2d_chunk_count >= 1, + "incorrect ", + CCL_ALLREDUCE_2D_CHUNK_COUNT, + " ", + allreduce_2d_chunk_count); + env_2_type(CCL_ALLREDUCE_2D_MIN_CHUNK_SIZE, allreduce_2d_min_chunk_size); + CCL_THROW_IF_NOT(allreduce_2d_min_chunk_size >= 1, + "incorrect ", + CCL_ALLREDUCE_2D_MIN_CHUNK_SIZE, + " ", + allreduce_2d_min_chunk_size); + env_2_type(CCL_ALLREDUCE_2D_SWITCH_DIMS, allreduce_2d_switch_dims); + env_2_type(CCL_ALLTOALL_SCATTER_MAX_OPS, (size_t&)alltoall_scatter_max_ops); env_2_enum(CCL_BACKEND, backend_names, backend); @@ -351,6 +403,11 @@ void env_data::parse() { env_2_topo(CCL_TOPO_COLOR, topo_color_names, topo_color); env_2_type(CCL_TOPO_P2P_ACCESS, enable_p2p_access); +#ifdef 
CCL_ENABLE_MPI + env_2_type(CCL_MPI_LIBRARY_PATH, mpi_lib_path); +#endif // CCL_ENABLE_MPI + env_2_type(CCL_OFI_LIBRARY_PATH, ofi_lib_path); + #ifdef CCL_ENABLE_SYCL env_2_type(CCL_KERNEL_PATH, kernel_path); if (kernel_path.empty()) { @@ -366,10 +423,12 @@ void env_data::parse() { env_2_type(CCL_KERNEL_DEBUG, kernel_debug); env_2_type(CCL_KERNEL_GROUP_SIZE, kernel_group_size); env_2_type(CCL_KERNEL_GROUP_COUNT, kernel_group_count); + env_2_type(CCL_KERNEL_MEM_ALIGN, kernel_mem_align); env_2_type(CCL_KERNEL_SYNC, enable_kernel_sync); env_2_type(CCL_KERNEL_1S_LEAD, kernel_1s_lead); env_2_type(CCL_KERNEL_1S_USE_COPY_OPS, enable_kernel_1s_copy_ops); env_2_type(CCL_KERNEL_1S_IPC_WA, enable_kernel_1s_ipc_wa); + env_2_type(CCL_KERNEL_SINGLE_REDUCE_PEERS, enable_kernel_single_reduce_peers); env_2_type(CCL_KERNEL_CLOSE_FD_WA, enable_close_fd_wa); env_2_type(CCL_SYCL_OUTPUT_EVENT, enable_sycl_output_event); @@ -378,23 +437,26 @@ void env_data::parse() { env_2_type(CCL_ZE_BARRIER, enable_ze_barrier); env_2_type(CCL_ZE_BIDIR_ALGO, enable_ze_bidir_algo); env_2_type(CCL_ZE_CACHE, enable_ze_cache); - env_2_type(CCL_ZE_CACHE_IPC_HANDLES, enable_ze_cache_ipc_handles); - env_2_type(CCL_ZE_CACHE_IPC_HANDLES_THRESHOLD, ze_cache_ipc_handles_threshold); + env_2_type(CCL_ZE_CACHE_OPEN_IPC_HANDLES, enable_ze_cache_open_ipc_handles); + env_2_type(CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD, ze_cache_open_ipc_handles_threshold); if (enable_ze_cache == 0) { - enable_ze_cache_ipc_handles = 0; + enable_ze_cache_open_ipc_handles = 0; } - else if (enable_ze_cache && enable_ze_cache_ipc_handles) { - CCL_THROW_IF_NOT(ze_cache_ipc_handles_threshold > 0, + else if (enable_ze_cache && enable_ze_cache_open_ipc_handles) { + CCL_THROW_IF_NOT(ze_cache_open_ipc_handles_threshold > 0, "incorrect ", - CCL_ZE_CACHE_IPC_HANDLES_THRESHOLD, + CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD, " ", - ze_cache_ipc_handles_threshold); + ze_cache_open_ipc_handles_threshold); } + env_2_type(CCL_ZE_CACHE_GET_IPC_HANDLES, 
enable_ze_cache_get_ipc_handles); env_2_type(CCL_ZE_SINGLE_LIST, enable_ze_single_list); env_2_type(CCL_ZE_DISABLE_FAMILY_CHECK, disable_ze_family_check); env_2_type(CCL_ZE_DISABLE_PORT_CHECK, disable_ze_port_check); + env_2_type(CCL_ZE_DISABLE_OVERSUBSCRIPTION_CHECK, ze_disable_oversubscription_check); env_2_type(CCL_ZE_SERIALIZE, ze_serialize_mode); - env_2_enum(CCL_ZE_COPY_ENGINE, ze_copy_engine_names, ze_copy_engine); + env_2_enum(CCL_ZE_COPY_ENGINE, ccl::ze::copy_engine_names, ze_copy_engine); + env_2_enum(CCL_ZE_H2D_COPY_ENGINE, ccl::ze::h2d_copy_engine_names, ze_h2d_copy_engine); env_2_type(CCL_ZE_MAX_COMPUTE_QUEUES, ze_max_compute_queues); CCL_THROW_IF_NOT( ze_max_compute_queues == CCL_ENV_SIZET_NOT_SPECIFIED || ze_max_compute_queues > 0, @@ -403,13 +465,14 @@ void env_data::parse() { " ", ze_max_compute_queues); env_2_type(CCL_ZE_MAX_COPY_QUEUES, ze_max_copy_queues); - CCL_THROW_IF_NOT(ze_copy_engine == ccl_ze_copy_engine_none || + CCL_THROW_IF_NOT(ze_copy_engine == ccl::ze::copy_engine_mode::none || ze_max_copy_queues == CCL_ENV_SIZET_NOT_SPECIFIED || ze_max_copy_queues > 0, "incorrect ", CCL_ZE_MAX_COPY_QUEUES, " ", ze_max_copy_queues); + env_2_type(CCL_ZE_ENABLE_CCS_FALLBACK_FOR_COPY, ze_enable_ccs_fallback_for_copy); env_2_type(CCL_ZE_LIST_DUMP, enable_ze_list_dump); env_2_type(CCL_ZE_QUEUE_INDEX_OFFSET, ze_queue_index_offset); CCL_THROW_IF_NOT(ze_queue_index_offset >= 0, @@ -422,21 +485,19 @@ void env_data::parse() { env_2_type(CCL_ZE_ENABLE, ze_enable); env_2_type(CCL_ZE_FINI_WA, ze_fini_wa); env_2_type(CCL_ZE_MULTI_WORKERS, ze_multi_workers); + env_2_enum(CCL_ZE_IPC_EXCHANGE, ze::ipc_exchange_names, ze_ipc_exchange); #endif // CCL_ENABLE_SYCL +#ifdef CCL_ENABLE_PMIX + env_2_type(CCL_PMIX_LIBRARY_PATH, pmix_lib_path); +#endif // CCL_ENABLE_PMIX + #ifdef CCL_ENABLE_ITT env_2_type(CCL_ITT_LEVEL, itt_level); #endif // CCL_ENABLE_ITT auto bf16_impl_types = ccl_bf16_get_impl_types(); - ccl_bf16_impl_type bf16_env_impl_type; - if (env_2_enum(CCL_BF16, 
bf16_env_impl_names, bf16_env_impl_type)) { - CCL_THROW_IF_NOT(bf16_impl_types.find(bf16_env_impl_type) != bf16_impl_types.end(), - "unsupported BF16 impl type: ", - bf16_env_impl_names[bf16_env_impl_type]); - bf16_impl_type = bf16_env_impl_type; - } - else { + if (!env_2_enum(CCL_BF16, bf16_impl_names, bf16_impl_type)) { bf16_impl_type = *bf16_impl_types.rbegin(); } @@ -482,8 +543,8 @@ void env_data::print(int rank) { LOG_INFO(global_data.hwloc_wrapper->to_string()); } - auto local_proc_idx = global_data.executor->get_local_proc_idx(); - auto local_proc_count = global_data.executor->get_local_proc_count(); + auto local_proc_idx = global_data.get_local_proc_idx(); + auto local_proc_count = global_data.get_local_proc_count(); if (rank < local_proc_count) { for (size_t w_idx = 0; w_idx < worker_count; w_idx++) { @@ -512,6 +573,11 @@ void env_data::print(int rank) { LOG_INFO(CCL_QUEUE_DUMP, ": ", queue_dump); LOG_INFO(CCL_SCHED_DUMP, ": ", sched_dump); LOG_INFO(CCL_SCHED_PROFILE, ": ", sched_profile); + LOG_INFO(CCL_ENTRY_MAX_UPDATE_TIME_SEC, + ": ", + (entry_max_update_time_sec != CCL_ENV_SIZET_NOT_SPECIFIED) + ? std::to_string(entry_max_update_time_sec) + : CCL_ENV_STR_NOT_SPECIFIED); LOG_INFO(CCL_FRAMEWORK, ": ", str_by_enum(ccl_framework_type_names, fw_type)); @@ -537,6 +603,10 @@ void env_data::print(int rank) { LOG_INFO(CCL_ALLREDUCE, ": ", (allreduce_algo_raw.length()) ? allreduce_algo_raw : CCL_ENV_STR_NOT_SPECIFIED); + LOG_INFO(CCL_ALLREDUCE_SCALEOUT, + ": ", + (allreduce_scaleout_algo_raw.length()) ? allreduce_scaleout_algo_raw + : CCL_ENV_STR_NOT_SPECIFIED); LOG_INFO(CCL_ALLTOALL, ": ", (alltoall_algo_raw.length()) ? 
alltoall_algo_raw : CCL_ENV_STR_NOT_SPECIFIED); @@ -576,22 +646,21 @@ void env_data::print(int rank) { LOG_INFO(CCL_STRICT_ORDER, ": ", enable_strict_order); LOG_INFO(CCL_STAGING_BUFFER, ": ", str_by_enum(staging_buffer_names, staging_buffer)); LOG_INFO(CCL_OP_SYNC, ": ", enable_op_sync); + LOG_INFO(CCL_USE_EXTERNAL_QUEUE, ": ", enable_external_queue); LOG_INFO(CCL_CHUNK_COUNT, ": ", chunk_count); LOG_INFO(CCL_MIN_CHUNK_SIZE, ": ", min_chunk_size); LOG_INFO(CCL_RS_CHUNK_COUNT, ": ", rs_chunk_count); LOG_INFO(CCL_RS_MIN_CHUNK_SIZE, ": ", rs_min_chunk_size); - LOG_INFO(CCL_AR2D_CHUNK_COUNT, ": ", ar2d_chunk_count); - LOG_INFO(CCL_AR2D_MIN_CHUNK_SIZE, ": ", ar2d_min_chunk_size); +#ifdef CCL_ENABLE_SYCL LOG_INFO(CCL_ALLGATHERV_TOPO_LARGE_SCALE, ": ", allgatherv_topo_large_scale); + LOG_INFO(CCL_ALLGATHERV_TOPO_READ, ": ", allgatherv_topo_read); + LOG_INFO(CCL_ALLTOALLV_TOPO_READ, ": ", alltoallv_topo_read); + LOG_INFO(CCL_REDUCE_SCATTER_MONOLITHIC_KERNEL, ": ", reduce_scatter_monolithic_kernel); + LOG_INFO(CCL_ALLGATHERV_MONOLITHIC_KERNEL, ": ", allgatherv_monolithic_kernel); +#endif // CCL_ENABLE_SYCL - LOG_INFO(CCL_ALLREDUCE_2D_BASE_SIZE, - ": ", - (allreduce_2d_base_size != CCL_ENV_SIZET_NOT_SPECIFIED) - ? std::to_string(allreduce_2d_base_size) - : CCL_ENV_STR_NOT_SPECIFIED); - LOG_INFO(CCL_ALLREDUCE_2D_SWITCH_DIMS, ": ", allreduce_2d_switch_dims); LOG_INFO(CCL_ALLREDUCE_NREDUCE_BUFFERING, ": ", allreduce_nreduce_buffering); LOG_INFO(CCL_ALLREDUCE_NREDUCE_SEGMENT_SIZE, ": ", @@ -599,6 +668,10 @@ void env_data::print(int rank) { ? 
std::to_string(allreduce_nreduce_segment_size) : CCL_ENV_STR_NOT_SPECIFIED); + LOG_INFO(CCL_ALLREDUCE_2D_CHUNK_COUNT, ": ", allreduce_2d_chunk_count); + LOG_INFO(CCL_ALLREDUCE_2D_MIN_CHUNK_SIZE, ": ", allreduce_2d_min_chunk_size); + LOG_INFO(CCL_ALLREDUCE_2D_SWITCH_DIMS, ": ", allreduce_2d_switch_dims); + LOG_INFO(CCL_ALLTOALL_SCATTER_MAX_OPS, ": ", (alltoall_scatter_max_ops != CCL_ENV_SIZET_NOT_SPECIFIED) @@ -618,27 +691,38 @@ void env_data::print(int rank) { LOG_INFO(CCL_PROCESS_LAUNCHER, ": ", str_by_enum(process_launcher_names, process_launcher)); +#ifdef CCL_ENABLE_MPI + LOG_INFO(CCL_MPI_LIBRARY_PATH, + ": ", + (!mpi_lib_path.empty()) ? mpi_lib_path : CCL_ENV_STR_NOT_SPECIFIED); +#endif // CCL_ENABLE_MPI + LOG_INFO(CCL_OFI_LIBRARY_PATH, + ": ", + (!ofi_lib_path.empty()) ? ofi_lib_path : CCL_ENV_STR_NOT_SPECIFIED); + #ifdef CCL_ENABLE_SYCL LOG_INFO(CCL_TOPO_ALGO, ": ", enable_topo_algo); LOG_INFO(CCL_TOPO_COLOR, ": ", str_by_enum(topo_color_names, topo_color)); - LOG_INFO(CCL_TOPO_P2P_ACCESS, ": ", enable_p2p_access); + LOG_INFO(CCL_TOPO_P2P_ACCESS, + ": ", + (enable_p2p_access != CCL_ENV_INT_NOT_SPECIFIED) ? std::to_string(enable_p2p_access) + : CCL_ENV_STR_NOT_SPECIFIED); LOG_INFO( CCL_KERNEL_PATH, ": ", (!kernel_path.empty()) ? kernel_path : CCL_ENV_STR_NOT_SPECIFIED); LOG_INFO(CCL_KERNEL_DEBUG, ": ", kernel_debug); - LOG_INFO(CCL_KERNEL_GROUP_SIZE, - ": ", - (kernel_group_size != CCL_ENV_SIZET_NOT_SPECIFIED) ? std::to_string(kernel_group_size) - : CCL_ENV_STR_NOT_SPECIFIED); + LOG_INFO(CCL_KERNEL_GROUP_SIZE, ": ", kernel_group_size); LOG_INFO(CCL_KERNEL_GROUP_COUNT, ": ", (kernel_group_count != CCL_ENV_SIZET_NOT_SPECIFIED) ? 
std::to_string(kernel_group_count) : CCL_ENV_STR_NOT_SPECIFIED); + LOG_INFO(CCL_KERNEL_MEM_ALIGN, ": ", kernel_mem_align); LOG_INFO(CCL_KERNEL_SYNC, ": ", enable_kernel_sync); LOG_INFO(CCL_KERNEL_1S_LEAD, ": ", kernel_1s_lead); LOG_INFO(CCL_KERNEL_1S_USE_COPY_OPS, ": ", enable_kernel_1s_copy_ops); LOG_INFO(CCL_KERNEL_1S_IPC_WA, ": ", enable_kernel_1s_ipc_wa); + LOG_INFO(CCL_KERNEL_SINGLE_REDUCE_PEERS, ": ", enable_kernel_single_reduce_peers); LOG_INFO(CCL_KERNEL_CLOSE_FD_WA, ": ", enable_close_fd_wa); LOG_INFO(CCL_SYCL_OUTPUT_EVENT, ": ", enable_sycl_output_event); @@ -647,13 +731,18 @@ void env_data::print(int rank) { LOG_INFO(CCL_ZE_BARRIER, ": ", enable_ze_barrier); LOG_INFO(CCL_ZE_BIDIR_ALGO, ": ", enable_ze_bidir_algo); LOG_INFO(CCL_ZE_CACHE, ": ", enable_ze_cache); - LOG_INFO(CCL_ZE_CACHE_IPC_HANDLES, ": ", enable_ze_cache_ipc_handles); - LOG_INFO(CCL_ZE_CACHE_IPC_HANDLES_THRESHOLD, ": ", ze_cache_ipc_handles_threshold); + LOG_INFO(CCL_ZE_CACHE_OPEN_IPC_HANDLES, ": ", enable_ze_cache_open_ipc_handles); + LOG_INFO(CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD, ": ", ze_cache_open_ipc_handles_threshold); + LOG_INFO(CCL_ZE_CACHE_GET_IPC_HANDLES, ": ", enable_ze_cache_get_ipc_handles); LOG_INFO(CCL_ZE_SINGLE_LIST, ": ", enable_ze_single_list); LOG_INFO(CCL_ZE_DISABLE_FAMILY_CHECK, ": ", disable_ze_family_check); LOG_INFO(CCL_ZE_DISABLE_PORT_CHECK, ": ", disable_ze_port_check); + LOG_INFO(CCL_ZE_DISABLE_OVERSUBSCRIPTION_CHECK, ": ", ze_disable_oversubscription_check); LOG_INFO(CCL_ZE_SERIALIZE, ": ", ze_serialize_mode); - LOG_INFO(CCL_ZE_COPY_ENGINE, ": ", str_by_enum(ze_copy_engine_names, ze_copy_engine)); + LOG_INFO(CCL_ZE_COPY_ENGINE, ": ", str_by_enum(ccl::ze::copy_engine_names, ze_copy_engine)); + LOG_INFO(CCL_ZE_H2D_COPY_ENGINE, + ": ", + str_by_enum(ccl::ze::h2d_copy_engine_names, ze_h2d_copy_engine)); LOG_INFO(CCL_ZE_MAX_COMPUTE_QUEUES, ": ", (ze_max_compute_queues != CCL_ENV_SIZET_NOT_SPECIFIED) @@ -664,6 +753,7 @@ void env_data::print(int rank) { 
(ze_max_copy_queues != CCL_ENV_SIZET_NOT_SPECIFIED) ? std::to_string(ze_max_copy_queues) : CCL_ENV_STR_NOT_SPECIFIED); + LOG_INFO(CCL_ZE_ENABLE_CCS_FALLBACK_FOR_COPY, ": ", ze_enable_ccs_fallback_for_copy); LOG_INFO(CCL_ZE_LIST_DUMP, ": ", enable_ze_list_dump); LOG_INFO(CCL_ZE_QUEUE_INDEX_OFFSET, ": ", ze_queue_index_offset); LOG_INFO(CCL_ZE_CLOSE_IPC_WA, ": ", ze_close_ipc_wa); @@ -673,8 +763,15 @@ void env_data::print(int rank) { LOG_INFO(CCL_ZE_ENABLE, ": ", ze_enable); LOG_INFO(CCL_ZE_FINI_WA, ": ", ze_fini_wa); LOG_INFO(CCL_ZE_MULTI_WORKERS, ": ", ze_multi_workers); + LOG_INFO(CCL_ZE_IPC_EXCHANGE, ": ", str_by_enum(ze::ipc_exchange_names, ze_ipc_exchange)); #endif // CCL_ENABLE_SYCL +#ifdef CCL_ENABLE_PMIX + LOG_INFO(CCL_PMIX_LIBRARY_PATH, + ": ", + (!pmix_lib_path.empty()) ? pmix_lib_path : CCL_ENV_STR_NOT_SPECIFIED); +#endif // CCL_ENABLE_PMIX + #ifdef CCL_ENABLE_ITT LOG_INFO(CCL_ITT_LEVEL, ": ", itt_level); #endif // CCL_ENABLE_ITT diff --git a/src/common/env/env.hpp b/src/common/env/env.hpp index 51dddb046..25bd8033d 100644 --- a/src/common/env/env.hpp +++ b/src/common/env/env.hpp @@ -30,6 +30,10 @@ #include "comp/bf16/bf16_utils.hpp" #include "comp/fp16/fp16_utils.hpp" #include "sched/cache/cache.hpp" +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +#include "common/global/ze/ze_fd_manager.hpp" +#include "sched/entry/ze/ze_primitives.hpp" +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE #include "topology/topo_manager.hpp" constexpr const char* CCL_ENV_STR_NOT_SPECIFIED = ""; @@ -41,6 +45,8 @@ constexpr const char* CCL_ABORT_ON_THROW = "CCL_ABORT_ON_THROW"; constexpr const char* CCL_QUEUE_DUMP = "CCL_QUEUE_DUMP"; constexpr const char* CCL_SCHED_DUMP = "CCL_SCHED_DUMP"; constexpr const char* CCL_SCHED_PROFILE = "CCL_SCHED_PROFILE"; +// maximum amount of time in seconds an entry can spend in update. 
for debug purpose +constexpr const char* CCL_ENTRY_MAX_UPDATE_TIME_SEC = "CCL_ENTRY_MAX_UPDATE_TIME_SEC"; constexpr const char* CCL_FRAMEWORK = "CCL_FRAMEWORK"; @@ -70,6 +76,7 @@ constexpr const char* CCL_MNIC_OFFSET = "CCL_MNIC_OFFSET"; constexpr const char* CCL_ALGO_FALLBACK = "CCL_ALGO_FALLBACK"; constexpr const char* CCL_ALLGATHERV = "CCL_ALLGATHERV"; constexpr const char* CCL_ALLREDUCE = "CCL_ALLREDUCE"; +constexpr const char* CCL_ALLREDUCE_SCALEOUT = "CCL_ALLREDUCE_SCALEOUT"; constexpr const char* CCL_ALLTOALL = "CCL_ALLTOALL"; constexpr const char* CCL_ALLTOALLV = "CCL_ALLTOALLV"; constexpr const char* CCL_BARRIER = "CCL_BARRIER"; @@ -95,22 +102,29 @@ constexpr const char* CCL_BUFFER_CACHE = "CCL_BUFFER_CACHE"; constexpr const char* CCL_STRICT_ORDER = "CCL_STRICT_ORDER"; constexpr const char* CCL_STAGING_BUFFER = "CCL_STAGING_BUFFER"; constexpr const char* CCL_OP_SYNC = "CCL_OP_SYNC"; +constexpr const char* CCL_USE_EXTERNAL_QUEUE = "CCL_USE_EXTERNAL_QUEUE"; constexpr const char* CCL_CHUNK_COUNT = "CCL_CHUNK_COUNT"; constexpr const char* CCL_MIN_CHUNK_SIZE = "CCL_MIN_CHUNK_SIZE"; constexpr const char* CCL_RS_CHUNK_COUNT = "CCL_RS_CHUNK_COUNT"; constexpr const char* CCL_RS_MIN_CHUNK_SIZE = "CCL_RS_MIN_CHUNK_SIZE"; -constexpr const char* CCL_AR2D_CHUNK_COUNT = "CCL_AR2D_CHUNK_COUNT"; -constexpr const char* CCL_AR2D_MIN_CHUNK_SIZE = "CCL_AR2D_MIN_CHUNK_SIZE"; +#ifdef CCL_ENABLE_SYCL +// use alternative allgatherv topo algorithm constexpr const char* CCL_ALLGATHERV_TOPO_LARGE_SCALE = "CCL_ALLGATHERV_TOPO_LARGE_SCALE"; - -constexpr const char* CCL_ALLREDUCE_2D_BASE_SIZE = "CCL_ALLREDUCE_2D_BASE_SIZE"; -constexpr const char* CCL_ALLREDUCE_2D_SWITCH_DIMS = "CCL_ALLREDUCE_2D_SWITCH_DIMS"; +constexpr const char* CCL_ALLGATHERV_TOPO_READ = "CCL_ALLGATHERV_TOPO_READ"; +constexpr const char* CCL_ALLTOALLV_TOPO_READ = "CCL_ALLTOALLV_TOPO_READ"; +constexpr const char* CCL_REDUCE_SCATTER_MONOLITHIC_KERNEL = "CCL_REDUCE_SCATTER_MONOLITHIC_KERNEL"; +constexpr const char* 
CCL_ALLGATHERV_MONOLITHIC_KERNEL = "CCL_ALLGATHERV_MONOLITHIC_KERNEL"; +#endif // CCL_ENABLE_SYCL constexpr const char* CCL_ALLREDUCE_NREDUCE_BUFFERING = "CCL_ALLREDUCE_NREDUCE_BUFFERING"; constexpr const char* CCL_ALLREDUCE_NREDUCE_SEGMENT_SIZE = "CCL_ALLREDUCE_NREDUCE_SEGMENT_SIZE"; +constexpr const char* CCL_ALLREDUCE_2D_CHUNK_COUNT = "CCL_ALLREDUCE_2D_CHUNK_COUNT"; +constexpr const char* CCL_ALLREDUCE_2D_MIN_CHUNK_SIZE = "CCL_ALLREDUCE_2D_MIN_CHUNK_SIZE"; +constexpr const char* CCL_ALLREDUCE_2D_SWITCH_DIMS = "CCL_ALLREDUCE_2D_SWITCH_DIMS"; + constexpr const char* CCL_ALLTOALL_SCATTER_MAX_OPS = "CCL_ALLTOALL_SCATTER_MAX_OPS"; constexpr const char* CCL_BACKEND = "CCL_BACKEND"; @@ -119,10 +133,12 @@ constexpr const char* CCL_KERNEL_PATH = "CCL_KERNEL_PATH"; constexpr const char* CCL_KERNEL_DEBUG = "CCL_KERNEL_DEBUG"; constexpr const char* CCL_KERNEL_GROUP_SIZE = "CCL_KERNEL_GROUP_SIZE"; constexpr const char* CCL_KERNEL_GROUP_COUNT = "CCL_KERNEL_GROUP_COUNT"; +constexpr const char* CCL_KERNEL_MEM_ALIGN = "CCL_KERNEL_MEM_ALIGN"; constexpr const char* CCL_KERNEL_SYNC = "CCL_KERNEL_SYNC"; constexpr const char* CCL_KERNEL_1S_LEAD = "CCL_KERNEL_1S_LEAD"; constexpr const char* CCL_KERNEL_1S_USE_COPY_OPS = "CCL_KERNEL_1S_USE_COPY_OPS"; constexpr const char* CCL_KERNEL_1S_IPC_WA = "CCL_KERNEL_1S_IPC_WA"; +constexpr const char* CCL_KERNEL_SINGLE_REDUCE_PEERS = "CCL_KERNEL_SINGLE_REDUCE_PEERS"; constexpr const char* CCL_KERNEL_CLOSE_FD_WA = "CCL_KERNEL_CLOSE_FD_WA"; constexpr const char* CCL_LOCAL_RANK = "CCL_LOCAL_RANK"; @@ -134,18 +150,33 @@ constexpr const char* CCL_TOPO_ALGO = "CCL_TOPO_ALGO"; constexpr const char* CCL_TOPO_COLOR = "CCL_TOPO_COLOR"; constexpr const char* CCL_TOPO_P2P_ACCESS = "CCL_TOPO_P2P_ACCESS"; +#ifdef CCL_ENABLE_MPI +constexpr const char* CCL_MPI_LIBRARY_PATH = "CCL_MPI_LIBRARY_PATH"; +#endif // CCL_ENABLE_MPI +constexpr const char* CCL_OFI_LIBRARY_PATH = "CCL_OFI_LIBRARY_PATH"; + +#ifdef CCL_ENABLE_SYCL constexpr const char* CCL_SYCL_OUTPUT_EVENT = 
"CCL_SYCL_OUTPUT_EVENT"; constexpr const char* CCL_USE_HMEM = "CCL_USE_HMEM"; constexpr const char* CCL_ZE_BARRIER = "CCL_ZE_BARRIER"; constexpr const char* CCL_ZE_BIDIR_ALGO = "CCL_ZE_BIDIR_ALGO"; constexpr const char* CCL_ZE_CACHE = "CCL_ZE_CACHE"; -constexpr const char* CCL_ZE_CACHE_IPC_HANDLES = "CCL_ZE_CACHE_IPC_HANDLES"; -constexpr const char* CCL_ZE_CACHE_IPC_HANDLES_THRESHOLD = "CCL_ZE_CACHE_IPC_HANDLES_THRESHOLD"; +constexpr const char* CCL_ZE_CACHE_OPEN_IPC_HANDLES = "CCL_ZE_CACHE_OPEN_IPC_HANDLES"; +constexpr const char* CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD = + "CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD"; +constexpr const char* CCL_ZE_CACHE_GET_IPC_HANDLES = "CCL_ZE_CACHE_GET_IPC_HANDLES"; +constexpr const char* CCL_ZE_DISABLE_OVERSUBSCRIPTION_CHECK = + "CCL_ZE_DISABLE_OVERSUBSCRIPTION_CHECK"; constexpr const char* CCL_ZE_SERIALIZE = "CCL_ZE_SERIALIZE"; + constexpr const char* CCL_ZE_COPY_ENGINE = "CCL_ZE_COPY_ENGINE"; +constexpr const char* CCL_ZE_H2D_COPY_ENGINE = "CCL_ZE_H2D_COPY_ENGINE"; constexpr const char* CCL_ZE_MAX_COMPUTE_QUEUES = "CCL_ZE_MAX_COMPUTE_QUEUES"; constexpr const char* CCL_ZE_MAX_COPY_QUEUES = "CCL_ZE_MAX_COPY_QUEUES"; +// use CCS for intra-card copy if main CE is not available +constexpr const char* CCL_ZE_ENABLE_CCS_FALLBACK_FOR_COPY = "CCL_ZE_ENABLE_CCS_FALLBACK_FOR_COPY"; + constexpr const char* CCL_ZE_LIST_DUMP = "CCL_ZE_LIST_DUMP"; constexpr const char* CCL_ZE_QUEUE_INDEX_OFFSET = "CCL_ZE_QUEUE_INDEX_OFFSET"; constexpr const char* CCL_ZE_CLOSE_IPC_WA = "CCL_ZE_CLOSE_IPC_WA"; @@ -156,6 +187,12 @@ constexpr const char* CCL_ZE_LIBRARY_PATH = "CCL_ZE_LIBRARY_PATH"; constexpr const char* CCL_ZE_ENABLE = "CCL_ZE_ENABLE"; constexpr const char* CCL_ZE_FINI_WA = "CCL_ZE_FINI_WA"; constexpr const char* CCL_ZE_MULTI_WORKERS = "CCL_ZE_MULTI_WORKERS"; +constexpr const char* CCL_ZE_IPC_EXCHANGE = "CCL_ZE_IPC_EXCHANGE"; +#endif // CCL_ENABLE_SYCL + +#ifdef CCL_ENABLE_PMIX +constexpr const char* CCL_PMIX_LIBRARY_PATH = 
"CCL_PMIX_LIBRARY_PATH"; +#endif // CCL_ENABLE_PMIX #ifdef CCL_ENABLE_ITT constexpr const char* CCL_ITT_LEVEL = "CCL_ITT_LEVEL"; @@ -176,13 +213,6 @@ enum ccl_atl_send_proxy { enum ccl_staging_buffer { ccl_staging_regular, ccl_staging_usm }; -enum ccl_ze_copy_engine_mode { - ccl_ze_copy_engine_none, - ccl_ze_copy_engine_main, - ccl_ze_copy_engine_link, - ccl_ze_copy_engine_auto -}; - enum class backend_mode { native, #ifdef CCL_ENABLE_STUB_BACKEND @@ -190,7 +220,14 @@ enum class backend_mode { #endif // CCL_ENABLE_STUB_BACKEND }; -enum class process_launcher_mode { hydra, torch, none }; +enum class process_launcher_mode { + hydra, + torch, +#ifdef CCL_ENABLE_PMIX + pmix, +#endif // CCL_ENABLE_PMIX + none +}; namespace ccl { @@ -217,6 +254,7 @@ class env_data { int queue_dump; int sched_dump; int sched_profile; + ssize_t entry_max_update_time_sec; ccl_framework_type fw_type; @@ -246,6 +284,7 @@ class env_data { and store only raw strings in env_data */ int enable_algo_fallback; + // main algorithm selection std::string allgatherv_algo_raw; std::string allreduce_algo_raw; std::string alltoall_algo_raw; @@ -254,6 +293,15 @@ class env_data { std::string bcast_algo_raw; std::string reduce_algo_raw; std::string reduce_scatter_algo_raw; + // scale-out selection part + std::string allgatherv_scaleout_algo_raw; + std::string allreduce_scaleout_algo_raw; + std::string alltoall_scaleout_algo_raw; + std::string alltoallv_scaleout_algo_raw; + std::string barrier_scaleout_algo_raw; + std::string bcast_scaleout_algo_raw; + std::string reduce_scaleout_algo_raw; + std::string reduce_scatter_scaleout_algo_raw; int enable_unordered_coll; int enable_fusion; @@ -273,21 +321,28 @@ class env_data { int enable_strict_order; ccl_staging_buffer staging_buffer; int enable_op_sync; + int enable_external_queue; size_t chunk_count; size_t min_chunk_size; size_t rs_chunk_count; size_t rs_min_chunk_size; - size_t ar2d_chunk_count; - size_t ar2d_min_chunk_size; +#ifdef CCL_ENABLE_SYCL int 
allgatherv_topo_large_scale; + int allgatherv_topo_read; + int alltoallv_topo_read; + int reduce_scatter_monolithic_kernel; + int allgatherv_monolithic_kernel; +#endif // CCL_ENABLE_SYCL - ssize_t allreduce_2d_base_size; - int allreduce_2d_switch_dims; int allreduce_nreduce_buffering; ssize_t allreduce_nreduce_segment_size; + size_t allreduce_2d_chunk_count; + size_t allreduce_2d_min_chunk_size; + int allreduce_2d_switch_dims; + ssize_t alltoall_scatter_max_ops; backend_mode backend; @@ -300,15 +355,22 @@ class env_data { topo_color_mode topo_color; int enable_p2p_access; +#ifdef CCL_ENABLE_MPI + std::string mpi_lib_path; +#endif // CCL_ENABLE_MPI + std::string ofi_lib_path; + #ifdef CCL_ENABLE_SYCL std::string kernel_path; int kernel_debug; ssize_t kernel_group_size; ssize_t kernel_group_count; + ssize_t kernel_mem_align; int enable_kernel_sync; int kernel_1s_lead; int enable_kernel_1s_copy_ops; int enable_kernel_1s_ipc_wa; + int enable_kernel_single_reduce_peers; int enable_close_fd_wa; int enable_sycl_output_event; @@ -317,15 +379,19 @@ class env_data { int enable_ze_barrier; int enable_ze_bidir_algo; int enable_ze_cache; - int enable_ze_cache_ipc_handles; - int ze_cache_ipc_handles_threshold; + int enable_ze_cache_open_ipc_handles; + int ze_cache_open_ipc_handles_threshold; + int enable_ze_cache_get_ipc_handles; int enable_ze_single_list; int disable_ze_family_check; int disable_ze_port_check; + int ze_disable_oversubscription_check; int ze_serialize_mode; - ccl_ze_copy_engine_mode ze_copy_engine; + ccl::ze::copy_engine_mode ze_copy_engine; + ccl::ze::h2d_copy_engine_mode ze_h2d_copy_engine; ssize_t ze_max_compute_queues; ssize_t ze_max_copy_queues; + int ze_enable_ccs_fallback_for_copy; int enable_ze_list_dump; int ze_queue_index_offset; int ze_close_ipc_wa; @@ -333,8 +399,13 @@ class env_data { int ze_enable; int ze_fini_wa; int ze_multi_workers; + ccl::ze::ipc_exchange_mode ze_ipc_exchange; #endif // CCL_ENABLE_SYCL +#ifdef CCL_ENABLE_PMIX + std::string 
pmix_lib_path; +#endif // CCL_ENABLE_PMIX + #ifdef CCL_ENABLE_ITT int itt_level; #endif // CCL_ENABLE_ITT @@ -432,7 +503,6 @@ class env_data { static std::map atl_transport_names; static std::map atl_send_proxy_names; static std::map staging_buffer_names; - static std::map ze_copy_engine_names; static std::map backend_names; static std::map process_launcher_names; diff --git a/src/common/event/impls/host_event.cpp b/src/common/event/impls/host_event.cpp index 53e0ae621..5419ce29f 100644 --- a/src/common/event/impls/host_event.cpp +++ b/src/common/event/impls/host_event.cpp @@ -15,17 +15,31 @@ */ #include "common/request/request.hpp" #include "common/event/impls/host_event.hpp" -#include "common/utils/sycl_utils.hpp" #include "exec/exec.hpp" +#ifdef CCL_ENABLE_SYCL +#include "common/utils/sycl_utils.hpp" +#endif // CCL_ENABLE_SYCL + namespace ccl { host_event_impl::host_event_impl(ccl_request* r) : req(r) { if (!req) { + completed = true; + return; + } +#ifdef CCL_ENABLE_SYCL + native_event = req->share_native_event(); +#endif // CCL_ENABLE_SYCL + if (req->synchronous) { + if (!ccl::global_data::get().executor.get()->is_locked) { + ccl_release_request(req); + } // if the user calls collective with coll_attr->synchronous=1 then it will be progressed - // in place and API will return null event. 
In this case mark request as completed, + // in place and in this case we mark request as completed, // all calls to wait() or test() will do nothing completed = true; + synchronous = true; } } @@ -50,7 +64,11 @@ host_event_impl::~host_event_impl() { void host_event_impl::wait() { if (!completed) { - ccl_wait_impl(ccl::global_data::get().executor.get(), req); + auto* exec = ccl::global_data::get().executor.get(); + ccl_wait_impl(exec, req); + if (synchronous && !exec->is_locked) { + ccl_release_request(req); + } completed = true; } } @@ -69,7 +87,7 @@ bool host_event_impl::cancel() { event::native_t& host_event_impl::get_native() { #ifdef CCL_ENABLE_SYCL if (ccl::global_data::env().enable_sycl_output_event) { - return req->get_native_event(); + return *native_event; } else { CCL_THROW("get_native() is not available without CCL_SYCL_OUTPUT_EVENT=1 env variable"); diff --git a/src/common/event/impls/host_event.hpp b/src/common/event/impls/host_event.hpp index d878cbf10..b2be10e2f 100644 --- a/src/common/event/impls/host_event.hpp +++ b/src/common/event/impls/host_event.hpp @@ -36,6 +36,12 @@ class host_event_impl final : public event_impl { private: ccl_request* req = nullptr; bool completed = false; + bool synchronous = false; + +#ifdef CCL_ENABLE_SYCL + // the actual sycl::event returned to the user via ccl::event.get_native() + std::shared_ptr native_event; +#endif // CCL_ENABLE_SYCL }; } // namespace ccl diff --git a/src/common/global/global.cpp b/src/common/global/global.cpp index 15d7973f8..92a5fdc43 100644 --- a/src/common/global/global.cpp +++ b/src/common/global/global.cpp @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "coll/selection/selection.hpp" +#include "common/api_wrapper/api_wrapper.hpp" +#include "common/api_wrapper/pmix_api_wrapper.hpp" #include "common/datatype/datatype.hpp" #include "common/global/global.hpp" #include "exec/exec.hpp" @@ -81,11 +83,21 @@ ccl::status global_data::reset() { ze_data.reset(); #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL + pmix_api_fini(); + + api_wrappers_fini(); + return ccl::status::success; } ccl::status global_data::init() { env_object.parse(); + + pmix_api_init(); + + set_local_coord(); + api_wrappers_init(); + env_object.set_internal_env(); os_info.fill(); @@ -94,27 +106,6 @@ ccl::status global_data::init() { env_object.enable_topo_algo = 0; } -#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) - if (ccl::global_data::env().backend == backend_mode::native && - ccl::global_data::env().ze_enable) { - LOG_INFO("initializing level-zero api"); - if (ze_api_init()) { - try { - ze_data.reset(new ze::global_data_desc); - } - catch (const ccl::exception& e) { - LOG_INFO("could not initialize level-zero: ", e.what()); - } - catch (...) 
{ - LOG_INFO("could not initialize level-zero: unknown error"); - } - } - else { - LOG_INFO("could not initialize level-zero api"); - } - } -#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL - init_resize_dependent_objects(); init_resize_independent_objects(); @@ -157,4 +148,76 @@ void global_data::reset_resize_independent_objects() { hwloc_wrapper.reset(); } +void global_data::getenv_local_coord(const char* local_proc_idx_env_name, + const char* local_proc_count_env_name) { + char* local_idx_env = getenv(local_proc_idx_env_name); + char* local_count_env = getenv(local_proc_count_env_name); + if (!(local_idx_env && local_count_env)) { + LOG_WARN("could not get local_idx/count from environment variables, " + "trying to get them from ATL"); +#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) + //TODO: set PIDFD in the comments + LOG_WARN("fallback to 'sockets' mode of exchange mechanism, to use DRMFD " + "set CCL_LOCAL_RANK/SIZE or use process launcher"); + global_data::env().ze_ipc_exchange = ccl::ze::ipc_exchange_mode::sockets; +#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL + local_proc_idx = CCL_ENV_INT_NOT_SPECIFIED; + local_proc_count = CCL_ENV_INT_NOT_SPECIFIED; + return; + } + + local_proc_idx = std::atoi(local_idx_env); + local_proc_count = std::atoi(local_count_env); + CCL_THROW_IF_NOT( + local_proc_idx != CCL_ENV_INT_NOT_SPECIFIED, "unexpected local_proc_idx ", local_proc_idx); + CCL_THROW_IF_NOT(local_proc_count != CCL_ENV_INT_NOT_SPECIFIED, + "unexpected local_proc_count ", + local_proc_count); +} + +void global_data::set_local_coord() { + auto& env = ccl::global_data::env(); + + if (env.process_launcher == process_launcher_mode::hydra) { + getenv_local_coord("MPI_LOCALRANKID", "MPI_LOCALNRANKS"); + } + else if (env.process_launcher == process_launcher_mode::torch) { + getenv_local_coord("LOCAL_RANK", "LOCAL_WORLD_SIZE"); + } +#ifdef CCL_ENABLE_PMIX + else if (env.process_launcher == process_launcher_mode::pmix) { + if (!get_pmix_local_coord(&local_proc_idx, 
&local_proc_count)) { + if (local_proc_idx == CCL_ENV_INT_NOT_SPECIFIED || + local_proc_count == CCL_ENV_INT_NOT_SPECIFIED) { + LOG_WARN("could not get local_idx/count from environment variables, " + "trying to get them from ATL"); +#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) + LOG_WARN("fallback to 'sockets' mode of exchange mechanism, to use DRMFD " + "set CCL_LOCAL_RANK/SIZE or use process launcher"); + global_data::env().ze_ipc_exchange = ccl::ze::ipc_exchange_mode::sockets; +#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL + } + else { + CCL_THROW("unexpected behaviour of get_pmix_local_coord local_proc_idx: ", + local_proc_idx, + ", local_proc_count: ", + local_proc_count); + } + } + } +#endif // CCL_ENABLE_PMIX + else if (env.process_launcher == process_launcher_mode::none) { + getenv_local_coord("CCL_LOCAL_RANK", "CCL_LOCAL_SIZE"); + } + else { + CCL_THROW("unexpected process launcher"); + } + LOG_INFO("process launcher: ", + ccl::env_data::process_launcher_names[env.process_launcher], + ", local_proc_idx: ", + local_proc_idx, + ", local_proc_count: ", + local_proc_count); +} + } // namespace ccl diff --git a/src/common/global/global.hpp b/src/common/global/global.hpp index 1749fb54c..2804cd61e 100644 --- a/src/common/global/global.hpp +++ b/src/common/global/global.hpp @@ -19,9 +19,8 @@ #include "common/env/env.hpp" #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) -#include "common/global/ze_data.hpp" +#include "common/global/ze/ze_data.hpp" #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL - #include "common/utils/utils.hpp" #include "hwloc/hwloc_wrapper.hpp" #include "internal_types.hpp" @@ -90,12 +89,32 @@ class global_data { static thread_local bool is_worker_thread; bool is_ft_enabled; + int get_local_proc_idx() const { + return local_proc_idx; + } + int get_local_proc_count() const { + return local_proc_count; + } + + void set_local_proc_idx(int local_idx) { + local_proc_idx = local_idx; + } + void set_local_proc_count(int local_count) { + 
local_proc_count = local_count; + } + private: global_data(); void init_resize_independent_objects(); void reset_resize_independent_objects(); + int local_proc_idx; + int local_proc_count; + void getenv_local_coord(const char* local_proc_idx_env_name, + const char* local_proc_count_env_name); + void set_local_coord(); + env_data env_object; os_information os_info; }; diff --git a/src/common/global/ze_data.cpp b/src/common/global/ze/ze_data.cpp similarity index 84% rename from src/common/global/ze_data.cpp rename to src/common/global/ze/ze_data.cpp index 410642aca..65760a17f 100644 --- a/src/common/global/ze_data.cpp +++ b/src/common/global/ze/ze_data.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "common/global/global.hpp" -#include "common/ze/ze_api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" namespace ccl { namespace ze { @@ -58,7 +58,6 @@ global_data_desc::global_data_desc() { for (uint32_t idx = 0; idx < device_count; idx++) { devices.push_back(device_info(devs[idx], idx)); - device_handles.push_back(devs[idx]); } for (uint32_t idx = 0; idx < device_count; idx++) { @@ -71,7 +70,6 @@ global_data_desc::global_data_desc() { for (uint32_t subdev_idx = 0; subdev_idx < subdevice_count; subdev_idx++) { devices.push_back(device_info(subdevs[subdev_idx], idx)); - device_handles.push_back(subdevs[subdev_idx]); } } } @@ -79,6 +77,20 @@ global_data_desc::global_data_desc() { cache = std::make_unique(global_data::env().worker_count); + if (global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) { + if (!ze::fd_manager::is_pidfd_supported()) { + global_data::env().ze_ipc_exchange = ccl::ze::ipc_exchange_mode::drmfd; + LOG_WARN("pidfd exchange mode is not supported, fallbacks to drmfd"); + } + else { + LOG_DEBUG("pidfd exchange mode is verified successfully"); + } + } + + if (global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) { + fd_manager = std::make_unique(); + } + LOG_INFO("initialized 
level-zero"); } @@ -97,11 +109,8 @@ global_data_desc::~global_data_desc() { contexts.clear(); devices.clear(); - device_handles.clear(); drivers.clear(); - ze_api_fini(); - LOG_INFO("finalized level-zero"); } diff --git a/src/common/global/ze_data.hpp b/src/common/global/ze/ze_data.hpp similarity index 94% rename from src/common/global/ze_data.hpp rename to src/common/global/ze/ze_data.hpp index d4da107af..10212646f 100644 --- a/src/common/global/ze_data.hpp +++ b/src/common/global/ze/ze_data.hpp @@ -17,6 +17,7 @@ #include +#include "common/global/ze/ze_fd_manager.hpp" #include "sched/entry/ze/ze_cache.hpp" #include "sched/entry/ze/ze_primitives.hpp" #include "sched/ze/ze_event_manager.hpp" @@ -24,7 +25,6 @@ #include "sched/sched_timer.hpp" namespace ccl { - namespace ze { struct device_info { @@ -39,12 +39,13 @@ struct global_data_desc { std::vector drivers; std::vector contexts; std::vector devices; - std::vector device_handles; std::unique_ptr cache; std::unordered_map dynamic_event_pools; std::atomic kernel_counter{}; + std::unique_ptr fd_manager; + global_data_desc(); global_data_desc(const global_data_desc&) = delete; global_data_desc(global_data_desc&&) = delete; diff --git a/src/common/global/ze/ze_fd_manager.cpp b/src/common/global/ze/ze_fd_manager.cpp new file mode 100644 index 000000000..00aff4468 --- /dev/null +++ b/src/common/global/ze/ze_fd_manager.cpp @@ -0,0 +1,464 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "common/global/global.hpp" +#include "common/global/ze/ze_fd_manager.hpp" +#include "common/log/log.hpp" +#include "common/utils/exchange_utils.hpp" +#include "common/utils/utils.hpp" +#include "common/utils/yield.hpp" + +#include +#ifdef CCL_ENABLE_DRM +#include "i915_drm.h" +#endif // CCL_ENABLE_DRM + +// pidfd system calls +#ifndef __NR_pidfd_open +#define __NR_pidfd_open 434 +#endif // __NR_pidfd_open +#ifndef __NR_pidfd_getfd +#define __NR_pidfd_getfd 438 +#endif // __NR_pidfd_getfd + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ccl { +namespace ze { + +fd_manager::fd_manager() { + device_fds = init_device_fds(); + exchange_device_fds(); + LOG_DEBUG("init completed"); +} + +fd_manager::~fd_manager() { + all_socks.clear(); + all_pids.clear(); + for (auto fd : device_fds) { + close(fd); + } + device_fds.clear(); +} + +bool fd_manager::is_pidfd_supported() { + int pid = getpid(); + char filename[] = "/tmp/oneccl_pidfd_check_getXXXXXXXXXX"; + std::vector fds; + bool result = true; + + auto check_fd = [&](int fd) { + if (fd == ccl::utils::invalid_fd) { + result = false; + } + fds.push_back(fd); + }; + + int file_fd = mkstemp(filename); + check_fd(file_fd); + + int pidfd = syscall(__NR_pidfd_open, pid, 0); + check_fd(pidfd); + + int dupfd = syscall(__NR_pidfd_getfd, pidfd, file_fd, 0); + check_fd(dupfd); + + for (auto &fd : fds) { + close(fd); + } + unlink(filename); + return result; +} + +void fd_manager::barrier(void *mem) { + static int call_count = 1; + + int local_count = ccl::global_data::get().get_local_proc_count(); + std::atomic *barrier_counter = static_cast *>(mem); + CCL_THROW_IF_NOT(barrier_counter == mem, + "barrier_counter: ", + barrier_counter, + " and mem:", + mem, + " must be the same"); + + ++(*barrier_counter); + LOG_DEBUG("barrier_counter: ", *barrier_counter); + + while ((*barrier_counter) < (call_count * local_count)) { + ccl_yield(ccl::global_data::env().yield_type); + } + 
call_count++; +} + +std::string fd_manager::get_shm_filename() { + std::string filename = "/dev/shm/ccl-shm-file"; + uid_t uid = getuid(); + std::stringstream ss; + ss << filename << "-" << std::to_string(uid); + return ss.str(); +} + +void *fd_manager::create_shared_memory() { + int local_count = ccl::global_data::get().get_local_proc_count(); + auto length = size_per_proc * local_count + counter_offset; + int prot = PROT_READ | PROT_WRITE; + int flags = MAP_SHARED; + + auto shm_filename = get_shm_filename(); + int fd = open(shm_filename.c_str(), O_CREAT | O_RDWR, 0666); + CCL_THROW_IF_NOT(fd > 0, "open failed: fd: ", fd, ", errno: ", strerror(errno)); + int ret = ftruncate(fd, length); + CCL_THROW_IF_NOT(ret != ccl::utils::invalid_err_code, + "ioctl failed: ret: ", + ret, + ", errno: ", + strerror(errno)); + + void *mem = mmap(nullptr, length, prot, flags, fd, 0); + CCL_THROW_IF_NOT(mem != MAP_FAILED, "mmap failed: mem: ", mem, ", errno: ", strerror(errno)); + + LOG_DEBUG("shm_filename: ", shm_filename, ", mem: ", mem, ", fd: ", fd); + barrier(mem); + + close(fd); + unlink(shm_filename.c_str()); + return mem; +} + +std::vector fd_manager::get_device_fds() { + return device_fds; +} + +std::vector fd_manager::init_device_fds() { + const char *device_dir = "/dev/dri/by-path/"; + const char *suffix = "-render"; + char device_name[NAME_MAX]; + struct dirent *ent = nullptr; + std::vector fds; + + DIR *dir = opendir(device_dir); + CCL_THROW_IF_NOT(dir, "opendir failed: could not open device directory"); + + LOG_DEBUG("search for all devices in the device directory"); + while ((ent = readdir(dir)) != nullptr) { + if (ent->d_name[0] == '.' 
|| strstr(ent->d_name, suffix) == nullptr) { + continue; + } + + memset(device_name, 0, sizeof(device_name)); + int ret = snprintf(device_name, NAME_MAX, "%s%s", device_dir, ent->d_name); + CCL_THROW_IF_NOT(ret > 0 || ret <= NAME_MAX, "could not create device name"); + + int fd = open(device_name, O_RDWR); + CCL_THROW_IF_NOT(fd > 0, "open failed: fd: ", fd, ", errno: ", strerror(errno)); + fds.push_back(fd); + CCL_THROW_IF_NOT( + fds.back() != ccl::utils::invalid_fd, "unexpected device fd: ", fds.back()); + LOG_DEBUG("device_name: ", device_name, " device_fd: ", fds.back()); + } + CCL_THROW_IF_NOT(!fds.empty(), "fds is empty"); + LOG_DEBUG("completed, fds size: ", fds.size()); + return fds; +} + +int fd_manager::pidfd_open(const int pid) { + int fd = syscall(__NR_pidfd_open, pid, 0); + CCL_THROW_IF_NOT(fd != ccl::utils::invalid_pid, + "pidfd_open failed: fd: ", + fd, + ", pid: ", + pid, + ", errno: ", + strerror(errno)); + LOG_DEBUG("pidfd_open: pid: ", pid, ", fd: ", fd); + return fd; +} + +int fd_manager::fd_to_mem_handle(int dev_fd, int fd) { +#ifdef CCL_ENABLE_DRM + struct drm_prime_handle req = { 0, 0, 0 }; + req.fd = fd; + + int ret = ioctl(dev_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &req); + CCL_THROW_IF_NOT(ret != ccl::utils::invalid_err_code, + "ioctl failed: ret: ", + ret, + ", errno: ", + strerror(errno), + ", dev_fd: ", + dev_fd, + ", fd: ", + fd); + LOG_DEBUG("dev_fd: ", dev_fd, ", req.fd: ", req.fd, ", handle: ", req.handle); + return req.handle; +#else // CCL_ENABLE_DRM + return ccl::utils::invalid_mem_handle; +#endif // CCL_ENABLE_DRM +} + +int fd_manager::convert_fd_drmfd(int convert_from_fd, int handle) { +#ifdef CCL_ENABLE_DRM + struct drm_prime_handle req = { 0, 0, 0 }; + req.flags = DRM_CLOEXEC | DRM_RDWR; + req.handle = handle; + + int ret = ioctl(convert_from_fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &req); + CCL_THROW_IF_NOT(ret != ccl::utils::invalid_err_code, + "ioctl failed: ret: ", + ret, + ", errno: ", + strerror(errno), + ", dev_fd: ", + 
convert_from_fd, + ", handle: ", + handle); + LOG_DEBUG("drm: dev_fd: ", convert_from_fd, ", req.handle: ", handle, ", fd: ", req.fd); + return req.fd; +#else // CCL_ENABLE_DRM + return ccl::utils::invalid_fd; +#endif // CCL_ENABLE_DRM +} + +int fd_manager::convert_fd_pidfd(int convert_from_fd, int handle) { + int fd = syscall(__NR_pidfd_getfd, convert_from_fd, handle, 0); + CCL_THROW_IF_NOT(fd != ccl::utils::invalid_fd, + "pidfd_getfd failed: " + "convert_from_fd: ", + convert_from_fd, + ", fd: ", + fd, + ", handle: ", + handle, + ", errno: ", + strerror(errno)); + LOG_DEBUG( + "pidfd_getfd: convert_from_fd: ", convert_from_fd, ", handle: ", handle, ", fd: ", fd); + return fd; +} + +int fd_manager::mem_handle_to_fd(int convert_from_fd, int handle) { + int fd = ccl::utils::invalid_fd; + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) { + fd = convert_fd_drmfd(convert_from_fd, handle); + } + else if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) { + fd = convert_fd_pidfd(convert_from_fd, handle); + } + else { + CCL_THROW("unexpected ipc_exchange_mode"); + } + return fd; +} + +std::vector fd_manager::setup_device_fds(int local_count, int proc_idx) { + std::vector fds; + if (proc_idx == 0) { + fds = device_fds; + // send the fds to all other local processes + for (int p_idx = 1; p_idx < local_count; p_idx++) { + for (auto &fd : fds) { + ccl::utils::sendmsg_call(all_socks[p_idx], fd, nullptr, 0, proc_idx); + } + } + } + else { + // receive the fds from local process 0 + for (auto fd : device_fds) { + close(fd); + } + fds.resize(device_fds.size()); + for (auto &fd : fds) { + ccl::utils::recvmsg_call(all_socks[0], &fd, nullptr, 0, proc_idx); + } + } + return fds; +} + +void fd_manager::exchange_device_fds() { + int sock_err; + std::string sock_name; + struct sockaddr_un sockaddr; + memset(&sockaddr, 0, sizeof(sockaddr)); + unsigned int sockaddr_len = sizeof(sockaddr); + int enable = 1; + + int local_count 
= ccl::global_data::get().get_local_proc_count(); + int local_idx = ccl::global_data::get().get_local_proc_idx(); + + auto length = size_per_proc * local_count + counter_offset; + + all_pids.resize(local_count, ccl::utils::invalid_pid); + all_socks.resize(local_count, ccl::utils::invalid_fd); + + pid_t pid = getpid(); + + // send own pid to all processes via shm + void *mem = create_shared_memory(); + void *shmem = (char *)mem + counter_offset; + + ((pid_t *)shmem)[local_idx] = pid; + + barrier(mem); + + for (int i = 0; i < local_count; i++) { + all_pids[i] = ((pid_t *)shmem)[i]; + } + CCL_THROW_IF_NOT(!all_pids.empty(), "all_pids shouldn't be empty"); + LOG_DEBUG("pid exchange is done: ", all_pids.size()); + + // create a named socket between local_idx + // 0 and all other local processes + if (local_idx == 0) { + barrier(mem); + for (int i = 1; i < local_count; ++i) { + std::string remote_sock_name; + struct sockaddr_un remote_sockaddr; + + remote_sock_name = "/tmp/ccl-ipc-fd-sock-" + std::to_string(all_pids[i]) + ":" + + std::to_string(i) + "-" + std::to_string(local_idx); + sock_name = "/tmp/ccl-ipc-fd-sock-" + std::to_string(pid) + ":" + + std::to_string(local_idx) + "-" + std::to_string(i); + + // create a socket for local proc j + all_socks[i] = socket(AF_UNIX, SOCK_STREAM, 0); + CCL_THROW_IF_NOT(all_socks[i] != ccl::utils::invalid_fd, + "socket failed: sock_err: ", + all_socks[i], + ", errno: ", + strerror(errno)); + + setsockopt(all_socks[i], SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); + sockaddr.sun_family = AF_UNIX; + strcpy(sockaddr.sun_path, sock_name.c_str()); + + sock_err = bind(all_socks[i], (struct sockaddr *)&sockaddr, sockaddr_len); + CCL_THROW_IF_NOT(sock_err != ccl::utils::invalid_err_code, + "bind failed: sock_err: ", + sock_err, + ", errno: ", + strerror(errno)); + + // connect to remote socket for local proc j + remote_sockaddr.sun_family = AF_UNIX; + strcpy(remote_sockaddr.sun_path, remote_sock_name.c_str()); + + sock_err = 
connect(all_socks[i], (struct sockaddr *)&remote_sockaddr, sockaddr_len); + if (sock_err < 0) { + if (errno == ECONNREFUSED || errno == ENOENT) { + return; + } + CCL_THROW("connect failed: error: ", + sock_err, + ", errno: ", + strerror(errno), + ", sock_name: ", + sock_name); + } + } + } + else { + int sock; + // create the local socket name + sock_name = "/tmp/ccl-ipc-fd-sock-" + std::to_string(pid) + ":" + + std::to_string(local_idx) + "-" + std::to_string(0); + // create a socket for local proc i + sock = socket(AF_UNIX, SOCK_STREAM, 0); + CCL_THROW_IF_NOT(sock != ccl::utils::invalid_fd, + "socket failed: sock: ", + sock, + ", errno: ", + strerror(errno)); + + setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); + sockaddr.sun_family = AF_UNIX; + strcpy(sockaddr.sun_path, sock_name.c_str()); + + sock_err = bind(sock, (struct sockaddr *)&sockaddr, sockaddr_len); + CCL_THROW_IF_NOT(sock_err != ccl::utils::invalid_err_code, + "bind failed: sock_err: ", + sock_err, + ", errno: ", + strerror(errno)); + + // listen to the socket to accept a connection to the other process + sock_err = listen(sock, local_count); + CCL_THROW_IF_NOT(sock_err != ccl::utils::invalid_err_code, + "listen failed: sock_err: ", + sock_err, + ", errno: ", + strerror(errno)); + + // notify the other process that the socket is created and being listened to + barrier(mem); + + all_socks[0] = accept(sock, (struct sockaddr *)&sockaddr, &sockaddr_len); + CCL_THROW_IF_NOT(all_socks[0] != ccl::utils::invalid_err_code, + "accept failed: sock: ", + all_socks[0], + ", errno: ", + strerror(errno)); + + setsockopt(all_socks[0], SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); + close(sock); + } + + LOG_DEBUG("connection is set up"); + device_fds = setup_device_fds(local_count, local_idx); + + // close sockets + if (local_idx == 0) { + close_sockets(local_count, local_idx); + barrier(mem); + } + else { + barrier(mem); + close_sockets(local_count, local_idx); + } + + int ret = munmap(mem, 
length); + CCL_THROW_IF_NOT(ret == 0, "munmap failed: ret: ", ret, ", errno: ", strerror(errno)); +} + +void fd_manager::close_sockets(int local_count, int proc_idx) { + int sock_err; + std::string sock_name; + for (int i = 0; i < local_count; ++i) { + if (all_socks[i] != ccl::utils::invalid_fd) { + sock_err = close(all_socks[i]); + CCL_THROW_IF_NOT(sock_err != ccl::utils::invalid_err_code, + "close failed: ret", + sock_err, + ", errno: ", + strerror(errno)); + } + + if (all_pids[proc_idx] != ccl::utils::invalid_pid && proc_idx != i) { + sock_name = "/tmp/ccl-ipc-fd-sock-" + std::to_string(all_pids[proc_idx]) + ":" + + std::to_string(proc_idx) + "-" + std::to_string(i); + sock_err = unlink(sock_name.c_str()); + } + } +} + +} // namespace ze +} // namespace ccl diff --git a/src/common/global/ze/ze_fd_manager.hpp b/src/common/global/ze/ze_fd_manager.hpp new file mode 100644 index 000000000..1628cee68 --- /dev/null +++ b/src/common/global/ze/ze_fd_manager.hpp @@ -0,0 +1,78 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#include "oneapi/ccl/config.h" + +#include +#include +#include + +namespace ccl { +namespace ze { + +enum class ipc_exchange_mode : int { sockets, drmfd, pidfd }; +static std::map ipc_exchange_names = { + std::make_pair(ipc_exchange_mode::sockets, "sockets"), +#ifdef CCL_ENABLE_DRM + std::make_pair(ipc_exchange_mode::drmfd, "drmfd"), +#endif // CCL_ENABLE_DRM + std::make_pair(ipc_exchange_mode::pidfd, "pidfd") +}; + +class fd_manager { +public: + fd_manager(); + fd_manager(const fd_manager&) = delete; + fd_manager(fd_manager&&) = delete; + fd_manager& operator=(const fd_manager&) = delete; + fd_manager& operator=(fd_manager&&) = delete; + ~fd_manager(); + + static int mem_handle_to_fd(int convert_from_fd, int fd); + static int fd_to_mem_handle(int dev_fd, int handle); + + static bool is_pidfd_supported(); + static int pidfd_open(const int pid); + + std::vector get_device_fds(); + +private: + void exchange_device_fds(); + std::vector setup_device_fds(int local_count, int proc_idx); + + void close_sockets(int local_count, int proc_idx); + + std::string get_shm_filename(); + void* create_shared_memory(); + void barrier(void* mem); + + static int convert_fd_pidfd(int convert_from_fd, int handle); + static int convert_fd_drmfd(int convert_from_fd, int handle); + + const int counter_offset = sizeof(int); + const int size_per_proc = sizeof(pid_t); + + std::vector init_device_fds(); + + std::vector all_socks; + std::vector all_pids; + + std::vector device_fds; +}; + +} // namespace ze +} // namespace ccl diff --git a/src/common/request/request.hpp b/src/common/request/request.hpp index d44a093ef..bbd4d8081 100644 --- a/src/common/request/request.hpp +++ b/src/common/request/request.hpp @@ -22,7 +22,7 @@ #ifdef CCL_ENABLE_SYCL #include -#endif +#endif // CCL_ENABLE_SYCL class ccl_sched; @@ -47,29 +47,44 @@ class alignas(CACHELINE_SIZE) ccl_request { mutable bool urgent = false; + bool synchronous = false; + #ifdef CCL_ENABLE_SYCL void 
set_native_event(sycl::event new_event) { - native_event = new_event; + native_event = std::make_shared(new_event); } sycl::event& get_native_event() { + return *native_event; + } + + std::shared_ptr& share_native_event() { return native_event; } void set_sync_event(sycl::event new_event) { - sync_event = new_event; + sync_event = std::make_shared(new_event); } sycl::event& get_sync_event() { - return sync_event; + return *sync_event; } bool has_output_event() const { - // by default the event is empty and is_host() is true, - // if we actually set it to a non-empty one, is_host() would be false - return !native_event.is_host(); + // by default the event is empty + if (!native_event) + return false; + // running on xpu it'd be true + return true; + } + + bool has_sync_event() const { + if (!sync_event) + return false; + // running on xpu it'd be true + return true; } -#endif +#endif // CCL_ENABLE_SYCL ccl_sched* get_sched() { return &sched; @@ -84,16 +99,16 @@ class alignas(CACHELINE_SIZE) ccl_request { private: #ifdef ENABLE_DEBUG void set_dump_callback(dump_func&& callback); -#endif +#endif // ENABLE_DEBUG #ifdef CCL_ENABLE_SYCL // The actual event from submit_barrier. It's returned to the user via ccl::event.get_native() - sycl::event native_event; + std::shared_ptr native_event; // This is basically a wrapped l0 event from sched_base, we need to keep as sycl object because its destructor // implies wait on the event, but in our case it's not yet completed(right after we created it from l0 event). // So just keep it here until we signal the corresponding l0 event. 
- sycl::event sync_event; -#endif + std::shared_ptr sync_event; +#endif // CCL_ENABLE_SYCL // ref to sched as part of which the request is created, there must be 1-to-1 relation ccl_sched& sched; @@ -102,5 +117,5 @@ class alignas(CACHELINE_SIZE) ccl_request { dump_func dump_callback; mutable size_t complete_checks_count = 0; static constexpr const size_t CHECK_COUNT_BEFORE_DUMP = 40000000; -#endif +#endif // ENABLE_DEBUG }; diff --git a/src/common/stream/stream.cpp b/src/common/stream/stream.cpp index a8ec51250..946613ac5 100644 --- a/src/common/stream/stream.cpp +++ b/src/common/stream/stream.cpp @@ -18,11 +18,10 @@ #include "common/stream/stream.hpp" #include "common/stream/stream_selector_impl.hpp" #include "common/utils/enums.hpp" -#include "common/utils/sycl_utils.hpp" #include "oneapi/ccl/native_device_api/export_api.hpp" #ifdef CCL_ENABLE_SYCL -#include +#include "common/utils/sycl_utils.hpp" #endif // CCL_ENABLE_SYCL namespace ccl { @@ -36,7 +35,8 @@ std::string to_string(device_family family) { } // namespace ccl std::string to_string(const stream_type& type) { - using stream_str_enum = utils::enum_to_str; + using stream_str_enum = + ccl::utils::enum_to_str; return stream_str_enum({ "host", "cpu", "gpu" }).choose(type, "unknown"); } @@ -49,7 +49,7 @@ ccl_stream::ccl_stream(stream_type type, native_stream = stream; #ifdef CCL_ENABLE_SYCL - cl::sycl::property_list props{}; + sycl::property_list props{}; if (stream.is_in_order()) { props = { sycl::property::queue::in_order{} }; } @@ -69,7 +69,40 @@ ccl_stream::ccl_stream(stream_type type, if (backend == ccl::utils::get_level_zero_backend() && ccl::global_data::get().ze_data) { device = sycl::get_native(stream.get_device()); context = sycl::get_native(stream.get_context()); + // TODO: after compiler 20220316, L0 cmd queue from sycl queue can be available only after sycl queue is called to submit kernel. 
+ // To WA this, submit barrier to create available L0 cmd queue when using external queue + if (ccl::global_data::env().enable_external_queue) { + stream.ext_oneapi_submit_barrier(); + } + cmd_queue = sycl::get_native(stream); + LOG_DEBUG("command queue initialized from external sycl queue is ", cmd_queue); device_family = ccl::ze::get_device_family(device); + + ccl::ze::ze_queue_properties_t queue_props; + ccl::ze::get_queues_properties(device, &queue_props); + + if (!ccl::global_data::env().disable_ze_family_check) { + if (queue_props.size() == 1 && queue_props.front().numQueues == 1 && + (device_family == ccl::device_family::unknown)) { + ze_device_properties_t dev_props = ccl::ze::default_device_props; + ZE_CALL(zeDeviceGetProperties, (device, &dev_props)); + bool is_integrated = dev_props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; + + if (!is_integrated) { + LOG_WARN("usage of discrete device with unexpected properties" + " (id: ", + dev_props.deviceId, + ", name: ", + (strlen(dev_props.name) ? 
dev_props.name : ""), + ", flags: ", + ccl::ze::flags_to_string(dev_props.flags), + ")" + ", set ", + CCL_ZE_DISABLE_FAMILY_CHECK, + "=1 to hide this message"); + } + } + } } #endif // CCL_ENABLE_ZE } @@ -98,7 +131,7 @@ std::string ccl_stream::to_string() const { #ifdef CCL_ENABLE_SYCL ss << "{ " << "type: " << ::to_string(type) << ", in_order: " << native_stream.is_in_order() - << ", device: " << native_stream.get_device().get_info() + << ", device: " << native_stream.get_device().get_info() << ", device_family: " << ccl::to_string(device_family) << " }"; #else // CCL_ENABLE_SYCL ss << reinterpret_cast(native_stream.get()); @@ -123,7 +156,7 @@ bool ccl_stream::is_gpu() const { } #ifdef CCL_ENABLE_SYCL -cl::sycl::backend ccl_stream::get_backend() const { +sycl::backend ccl_stream::get_backend() const { return backend; } #ifdef CCL_ENABLE_ZE @@ -139,5 +172,13 @@ ze_context_handle_t ccl_stream::get_ze_context() const { CCL_THROW_IF_NOT(context, "no context"); return context; } + +ze_command_queue_handle_t ccl_stream::get_ze_command_queue() const { + CCL_THROW_IF_NOT(backend == ccl::utils::get_level_zero_backend()); + if (ccl::global_data::env().enable_external_queue) { + CCL_THROW_IF_NOT(cmd_queue, "no command queue"); + } + return cmd_queue; +} #endif // CCL_ENABLE_ZE #endif // CCL_ENBALE_SYCL diff --git a/src/common/stream/stream.hpp b/src/common/stream/stream.hpp index e5bff9ebf..a2bbfb490 100644 --- a/src/common/stream/stream.hpp +++ b/src/common/stream/stream.hpp @@ -24,7 +24,7 @@ #include "oneapi/ccl/type_traits.hpp" #ifdef CCL_ENABLE_SYCL -#include +#include "common/utils/sycl_utils.hpp" #endif // CCL_ENABLE_SYCL namespace ccl { @@ -57,10 +57,11 @@ class alignas(CACHELINE_SIZE) ccl_stream : public stream_selector { bool is_gpu() const; #ifdef CCL_ENABLE_SYCL - cl::sycl::backend get_backend() const; + sycl::backend get_backend() const; #ifdef CCL_ENABLE_ZE ze_device_handle_t get_ze_device() const; ze_context_handle_t get_ze_context() const; + 
ze_command_queue_handle_t get_ze_command_queue() const; #endif // CCL_ENABLE_ZE #endif // CCL_ENBALE_SYCL @@ -93,11 +94,12 @@ class alignas(CACHELINE_SIZE) ccl_stream : public stream_selector { ccl::device_family device_family; #ifdef CCL_ENABLE_SYCL - cl::sycl::backend backend; + sycl::backend backend; #ifdef CCL_ENABLE_ZE ze_device_handle_t device{}; ze_context_handle_t context{}; + ze_command_queue_handle_t cmd_queue{}; #endif // CCL_ENABLE_ZE #endif // CCL_ENBALE_SYCL }; diff --git a/src/common/stream/stream_selector.hpp b/src/common/stream/stream_selector.hpp index e39ba5a03..5743e6ab8 100644 --- a/src/common/stream/stream_selector.hpp +++ b/src/common/stream/stream_selector.hpp @@ -16,7 +16,7 @@ #pragma once #ifdef CCL_ENABLE_ZE -#include "common/ze/ze_api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" #endif // CCL_ENABLE_ZE #ifdef CCL_ENABLE_SYCL diff --git a/src/common/stream/stream_selector_impl.hpp b/src/common/stream/stream_selector_impl.hpp index 4e8a8ba4f..27fb6f5c9 100644 --- a/src/common/stream/stream_selector_impl.hpp +++ b/src/common/stream/stream_selector_impl.hpp @@ -27,10 +27,7 @@ std::unique_ptr stream_selector::create(stream_native_t& native_stre stream_type type = stream_type::host; #ifdef CCL_ENABLE_SYCL - if (native_stream.get_device().is_host()) { - type = stream_type::host; - } - else if (native_stream.get_device().is_cpu()) { + if (native_stream.get_device().is_cpu()) { type = stream_type::cpu; } else if (native_stream.get_device().is_gpu()) { @@ -41,7 +38,7 @@ std::unique_ptr stream_selector::create(stream_native_t& native_stre "core", "create_stream", std::string("unsupported SYCL queue's device type:\n") + - native_stream.get_device().template get_info() + + native_stream.get_device().template get_info() + std::string("supported types: host, cpu, gpu")); } #endif // CCL_ENABLE_SYCL diff --git a/src/common/utils/buffer.hpp b/src/common/utils/buffer.hpp index 095e20245..75f9ff0f8 100644 --- a/src/common/utils/buffer.hpp 
+++ b/src/common/utils/buffer.hpp @@ -95,49 +95,49 @@ class ccl_buffer { CCL_ASSERT(check_offset()); } - ccl_buffer& operator=(const ccl_buffer& src) { - if (this != &src) { - this->src = src.src; - this->size = src.size; - this->offset = src.offset; - this->type = src.type; + ccl_buffer& operator=(const ccl_buffer& other) { + if (this != &other) { + this->src = other.src; + this->size = other.size; + this->offset = other.offset; + this->type = other.type; CCL_ASSERT(check_offset()); } return *this; } - void set(void* src, ssize_t size, size_t offset, ccl_buffer_type type) { + void set(void* buf_src, ssize_t buf_size, size_t buf_offset, ccl_buffer_type buf_type) { LOG_TRACE("set: src ", - src, + buf_src, ", size ", - size, + buf_size, ", offset ", - offset, + buf_offset, ", type ", - type, + buf_type, ", old src: ", this->src); - CCL_ASSERT(src, "new src is null"); + CCL_ASSERT(buf_src, "new src is null"); - this->src = src; - this->size = size; - this->offset = offset; - this->type = type; + this->src = buf_src; + this->size = buf_size; + this->offset = buf_offset; + this->type = buf_type; CCL_ASSERT(check_offset()); } - void set(void* src) { - set(src, -1, 0, ccl_buffer_type::DIRECT); + void set(void* buf_src) { + set(buf_src, -1, 0, ccl_buffer_type::DIRECT); } - void set(void* src, ssize_t size) { - set(src, size, 0, ccl_buffer_type::DIRECT); + void set(void* buf_src, ssize_t buf_size) { + set(buf_src, buf_size, 0, ccl_buffer_type::DIRECT); } - void set(void* src, ssize_t size, ccl_buffer_type type) { - set(src, size, 0, type); + void set(void* buf_src, ssize_t buf_size, ccl_buffer_type buf_type) { + set(buf_src, buf_size, 0, buf_type); } - void set(void* src, ssize_t size, size_t offset) { - set(src, size, offset, ccl_buffer_type::DIRECT); + void set(void* buf_src, ssize_t buf_size, size_t buf_offset) { + set(buf_src, buf_size, buf_offset, ccl_buffer_type::DIRECT); } void* get_src() const { diff --git a/src/common/utils/enums.hpp b/src/common/utils/enums.hpp 
index d6250fc70..c07388973 100644 --- a/src/common/utils/enums.hpp +++ b/src/common/utils/enums.hpp @@ -18,6 +18,7 @@ #include #include +namespace ccl { namespace utils { namespace detail { struct failure_callback { @@ -75,3 +76,4 @@ constexpr typename std::underlying_type::type enum_to_underlyi return static_cast::type>(val); } } // namespace utils +} // namespace ccl diff --git a/src/common/utils/exchange_utils.cpp b/src/common/utils/exchange_utils.cpp new file mode 100644 index 000000000..de35fb7e3 --- /dev/null +++ b/src/common/utils/exchange_utils.cpp @@ -0,0 +1,225 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "common/global/global.hpp" +#include "common/log/log.hpp" +#include "common/utils/exchange_utils.hpp" +#include "common/utils/fd_info.hpp" + +namespace ccl { +namespace utils { + +bool allgather(std::shared_ptr comm, + const void* send_buf, + void* recv_buf, + int bytes, + bool sync) { + std::vector recv_bytes(comm->get_size(), bytes); + return allgatherv(comm, send_buf, recv_buf, recv_bytes, sync); +} + +bool allgatherv(std::shared_ptr comm, + const void* send_buf, + void* recv_buf, + const std::vector& recv_bytes, + bool sync) { + atl_req_t req{}; + bool ret = true; + int comm_rank = comm->get_rank(); + int comm_size = comm->get_size(); + + CCL_THROW_IF_NOT((int)recv_bytes.size() == comm->get_size(), + "unexpected recv_bytes size ", + recv_bytes.size(), + ", comm_size ", + comm_size); + + std::vector offsets(comm_size, 0); + for (int i = 1; i < comm_size; i++) { + offsets[i] = offsets[i - 1] + recv_bytes[i - 1]; + } + + comm->allgatherv(0 /* ep_idx */, + send_buf, + recv_bytes[comm_rank], + recv_buf, + recv_bytes.data(), + offsets.data(), + req); + if (sync) { + comm->wait(0 /* ep_idx */, req); + } + else { + CCL_THROW("unexpected sync parameter"); + } + return ret; +} + +int check_msg_retval(std::string operation_name, + ssize_t bytes, + struct iovec iov, + struct msghdr msg, + size_t union_size, + int sock, + int fd) { + LOG_DEBUG(operation_name, + ": ", + bytes, + ", expected_bytes:", + iov.iov_len, + ", expected size of cntr_buf: ", + union_size, + " -> gotten cntr_buf: ", + msg.msg_controllen, + ", socket: ", + sock, + ", fd: ", + fd); + int ret = -1; + if (bytes == static_cast(iov.iov_len)) { + ret = 0; + } + else if (bytes < 0) { + ret = -errno; + } + else { + ret = -EIO; + } + return ret; +} + +void sendmsg_fd(int sock, int fd, void* payload, int payload_len) { + CCL_THROW_IF_NOT(fd >= 0, "unexpected fd value"); + char empty_buf; + struct iovec iov; + memset(&iov, 0, sizeof(iov)); + if (!payload) { + iov.iov_base = &empty_buf; + iov.iov_len 
= 1; + } + else { + iov.iov_base = payload; + iov.iov_len = payload_len; + } + + union { + struct cmsghdr align; + char cntr_buf[CMSG_SPACE(sizeof(int))]{}; + } u; + + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_control = u.cntr_buf; + msg.msg_controllen = sizeof(u.cntr_buf); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + auto cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(sizeof(fd)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + *(int*)CMSG_DATA(cmsg) = fd; + + ssize_t send_bytes = sendmsg(sock, &msg, 0); + CCL_THROW_IF_NOT( + !check_msg_retval("sendmsg", send_bytes, iov, msg, sizeof(u.cntr_buf), sock, fd), + " errno: ", + strerror(errno)); +} + +void recvmsg_fd(int sock, int* fd, void* payload, int payload_len) { + CCL_THROW_IF_NOT(fd != nullptr, "unexpected fd value"); + char empty_buf; + struct iovec iov; + memset(&iov, 0, sizeof(iov)); + if (!payload) { + iov.iov_base = &empty_buf; + iov.iov_len = 1; + } + else { + iov.iov_base = payload; + iov.iov_len = payload_len; + } + + union { + struct cmsghdr align; + char cntr_buf[CMSG_SPACE(sizeof(int))]{}; + } u; + + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_control = u.cntr_buf; + msg.msg_controllen = sizeof(u.cntr_buf); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ssize_t recv_bytes = recvmsg(sock, &msg, 0); + CCL_THROW_IF_NOT( + !check_msg_retval("recvmsg", recv_bytes, iov, msg, sizeof(u.cntr_buf), sock, *fd), + " errno: ", + strerror(errno)); + + if (msg.msg_flags & (MSG_CTRUNC | MSG_TRUNC)) { + std::string flag_str = ""; + if (msg.msg_flags & MSG_CTRUNC) { + flag_str += " MSG_CTRUNC"; + } + if (msg.msg_flags & MSG_TRUNC) { + flag_str += " MSG_TRUNC"; + } + + /** MSG_CTRUNC message can be in case of: + * - remote peer send invalid fd, so msg_controllen == 0 + * - limit of fds reached in the current process, so msg_controllen == 0 + * - the remote peer control message > msg_control buffer size + */ + CCL_THROW("control or usual message is 
truncated:", + flag_str, + " control message size: ", + msg.msg_controllen, + ", ", + to_string(ccl::utils::get_fd_info())); + } + + for (auto cmsg = CMSG_FIRSTHDR(&msg); cmsg != nullptr; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)) && cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + memcpy(fd, CMSG_DATA(cmsg), sizeof(int)); + break; + } + } + + // we assume that the message has a strict format and size, if not this means that something + // is wrong. + size_t expected_len = 1; + if (payload) { + expected_len = payload_len; + } + if (msg.msg_iov[0].iov_len != expected_len) { + CCL_THROW("received data in unexpected format"); + } +} + +void sendmsg_call(int sock, int fd, void* payload, int payload_len, const int rank) { + sendmsg_fd(sock, fd, payload, payload_len); + LOG_DEBUG("send: rank[", rank, "], send fd: ", fd, ", sock: ", sock); +} + +void recvmsg_call(int sock, int* fd, void* payload, int payload_len, const int rank) { + recvmsg_fd(sock, fd, payload, payload_len); + LOG_DEBUG("recv: rank[", rank, "], got fd: ", fd, ", sock: ", sock); +} +} // namespace utils +} // namespace ccl diff --git a/src/common/utils/exchange_utils.hpp b/src/common/utils/exchange_utils.hpp new file mode 100644 index 000000000..af5106b64 --- /dev/null +++ b/src/common/utils/exchange_utils.hpp @@ -0,0 +1,53 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#include "atl/atl_base_comm.hpp" + +#include +#include +#include +#include +#include + +namespace ccl { +namespace utils { +bool allgather(std::shared_ptr comm, + const void* send_buf, + void* recv_buf, + int bytes, + bool sync = true); +bool allgatherv(std::shared_ptr comm, + const void* send_buf, + void* recv_buf, + const std::vector& recv_bytes, + bool sync = true); + +int check_msg_retval(std::string operation_name, + ssize_t bytes, + struct iovec iov, + struct msghdr msg, + size_t union_size, + int sock, + int fd); +void sendmsg_fd(int sock, int fd, void* payload, int payload_len); +void recvmsg_fd(int sock, int* fd, void* payload, int payload_len); + +void sendmsg_call(int sock, int fd, void* payload, int payload_len, const int rank); +void recvmsg_call(int sock, int* fd, void* payload, int payload_len, const int rank); + +} // namespace utils +} // namespace ccl diff --git a/src/common/utils/sycl_utils.cpp b/src/common/utils/sycl_utils.cpp index 63c6ddfe3..bd9f4bf84 100644 --- a/src/common/utils/sycl_utils.cpp +++ b/src/common/utils/sycl_utils.cpp @@ -17,8 +17,6 @@ #include "common/stream/stream.hpp" #include "common/utils/sycl_utils.hpp" -#include - namespace ccl { namespace utils { @@ -43,10 +41,7 @@ std::string usm_type_to_str(sycl::usm::alloc type) { } std::string sycl_device_to_str(const sycl::device& dev) { - if (dev.is_host()) { - return "host"; - } - else if (dev.is_cpu()) { + if (dev.is_cpu()) { return "cpu"; } else if (dev.is_gpu()) { @@ -60,31 +55,31 @@ std::string sycl_device_to_str(const sycl::device& dev) { } } -sycl::event submit_barrier(cl::sycl::queue queue) { -#if DPCPP_VERSION >= 140000 +sycl::event submit_barrier(sycl::queue queue) { +#if ICPX_VERSION >= 140000 return queue.ext_oneapi_submit_barrier(); -#elif DPCPP_VERSION < 140000 +#elif ICPX_VERSION < 140000 return queue.submit_barrier(); -#endif // DPCPP_VERSION +#endif // ICPX_VERSION } -sycl::event submit_barrier(cl::sycl::queue queue, sycl::event event) { -#if 
DPCPP_VERSION >= 140000 +sycl::event submit_barrier(sycl::queue queue, sycl::event event) { +#if ICPX_VERSION >= 140000 return queue.ext_oneapi_submit_barrier({ event }); -#elif DPCPP_VERSION < 140000 +#elif ICPX_VERSION < 140000 return queue.submit_barrier({ event }); -#endif // DPCPP_VERSION +#endif // ICPX_VERSION } #ifdef CCL_ENABLE_SYCL_INTEROP_EVENT sycl::event make_event(const sycl::context& context, const ze_event_handle_t& sync_event) { -#if DPCPP_VERSION >= 140000 +#if ICPX_VERSION >= 140000 return sycl::make_event( { sync_event, sycl::ext::oneapi::level_zero::ownership::keep }, context); -#elif DPCPP_VERSION < 140000 +#elif ICPX_VERSION < 140000 return sycl::level_zero::make( context, sync_event, sycl::level_zero::ownership::keep); -#endif // DPCPP_VERSION +#endif // ICPX_VERSION } #endif // CCL_ENABLE_SYCL_INTEROP_EVENT diff --git a/src/common/utils/sycl_utils.hpp b/src/common/utils/sycl_utils.hpp index cf413a9c7..e144034aa 100644 --- a/src/common/utils/sycl_utils.hpp +++ b/src/common/utils/sycl_utils.hpp @@ -15,16 +15,44 @@ */ #pragma once -#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) - -#include "common/ze/ze_api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" +#if __has_include() +#include +#elif __has_include() #include +#else +#error "Unsupported compiler" +#endif + +#if defined(__INTEL_LLVM_COMPILER) +#if (__INTEL_LLVM_COMPILER < 20230000) +#define CCL_USE_SYCL121_API 1 +#else // (__INTEL_LLVM_COMPILER < 20230000) +#define CCL_USE_SYCL121_API 0 +#endif // (__INTEL_LLVM_COMPILER < 20230000) +#elif defined(__LIBSYCL_MAJOR_VERSION) +#if (__LIBSYCL_MAJOR_VERSION < 6) +#define CCL_USE_SYCL121_API 1 +#else // (__LIBSYCL_MAJOR_VERSION < 6) +#define CCL_USE_SYCL121_API 0 +#endif // (__LIBSYCL_MAJOR_VERSION < 6) +#else // __INTEL_LLVM_COMPILER || __LIBSYCL_MAJOR_VERSION +#error "Unsupported compiler" +#endif + +#if CCL_USE_SYCL121_API +#include +#include +#else // CCL_USE_SYCL121_API +#include +#include +#endif // 
CCL_USE_SYCL121_API #ifdef SYCL_LANGUAGE_VERSION -#define DPCPP_VERSION __clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__ +#define ICPX_VERSION __clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__ #else // SYCL_LANGUAGE_VERSION -#define DPCPP_VERSION 0 +#define ICPX_VERSION 0 #endif // SYCL_LANGUAGE_VERSION class ccl_stream; @@ -39,15 +67,15 @@ std::string usm_type_to_str(sycl::usm::alloc type); std::string sycl_device_to_str(const sycl::device& dev); constexpr sycl::backend get_level_zero_backend() { -#if DPCPP_VERSION >= 140000 +#if ICPX_VERSION >= 140000 return sycl::backend::ext_oneapi_level_zero; -#elif DPCPP_VERSION < 140000 +#elif ICPX_VERSION < 140000 return sycl::backend::level_zero; -#endif // DPCPP_VERSION +#endif // ICPX_VERSION } -sycl::event submit_barrier(cl::sycl::queue queue); -sycl::event submit_barrier(cl::sycl::queue queue, sycl::event event); +sycl::event submit_barrier(sycl::queue queue); +sycl::event submit_barrier(sycl::queue queue, sycl::event event); #ifdef CCL_ENABLE_SYCL_INTEROP_EVENT sycl::event make_event(const sycl::context& context, const ze_event_handle_t& sync_event); @@ -57,5 +85,3 @@ ze_event_handle_t get_native_event(sycl::event event); } // namespace utils } // namespace ccl - -#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL diff --git a/src/common/utils/utils.cpp b/src/common/utils/utils.cpp index 387e15a8c..25c17aa35 100644 --- a/src/common/utils/utils.cpp +++ b/src/common/utils/utils.cpp @@ -72,5 +72,20 @@ void str_to_array(const std::string& input_str, result.push_back(input_str.substr(last)); } +uintptr_t get_aligned_offset_byte(const void* ptr, + const size_t buf_size_bytes, + const size_t mem_align_bytes) { + // find the number of data items to remove to start from aligned bytes + unsigned long pre_align_offset_byte = (uintptr_t)ptr % mem_align_bytes; + if (pre_align_offset_byte != 0) { + pre_align_offset_byte = mem_align_bytes - pre_align_offset_byte; + } + // make sure to use only the 
required number of threads for very small data count + if (buf_size_bytes < pre_align_offset_byte) { + pre_align_offset_byte = buf_size_bytes; + } + return pre_align_offset_byte; +} + } // namespace utils } // namespace ccl diff --git a/src/common/utils/utils.hpp b/src/common/utils/utils.hpp index 44b06f946..b6c7ef5d4 100644 --- a/src/common/utils/utils.hpp +++ b/src/common/utils/utils.hpp @@ -109,30 +109,31 @@ size_t alignment = CCL_REG_MSG_ALIGNMENT; \ if (size >= CCL_LARGE_MSG_THRESHOLD) \ alignment = CCL_LARGE_MSG_ALIGNMENT; \ - void* ptr = CCL_MEMALIGN_IMPL(size, alignment); \ - CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \ - ptr; \ + void* mem_ptr = CCL_MEMALIGN_IMPL(size, alignment); \ + CCL_THROW_IF_NOT(mem_ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \ + mem_ptr; \ }) #define CCL_MEMALIGN_WRAPPER(size, align, name) \ ({ \ - void* ptr = CCL_MEMALIGN_IMPL(size, align); \ - CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \ - ptr; \ + void* mem_ptr = CCL_MEMALIGN_IMPL(size, align); \ + CCL_THROW_IF_NOT(mem_ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \ + mem_ptr; \ }) #define CCL_REALLOC_WRAPPER(old_ptr, old_size, new_size, align, name) \ ({ \ - void* ptr = CCL_REALLOC_IMPL(old_ptr, old_size, new_size, align); \ - CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", new_size, ", out of memory, ", name); \ - ptr; \ + void* mem_ptr = CCL_REALLOC_IMPL(old_ptr, old_size, new_size, align); \ + CCL_THROW_IF_NOT( \ + mem_ptr, "CCL cannot allocate bytes: ", new_size, ", out of memory, ", name); \ + mem_ptr; \ }) #define CCL_CALLOC_WRAPPER(size, align, name) \ ({ \ - void* ptr = CCL_CALLOC_IMPL(size, align); \ - CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \ - ptr; \ + void* mem_ptr = CCL_CALLOC_IMPL(size, align); \ + CCL_THROW_IF_NOT(mem_ptr, "CCL cannot allocate bytes: ", size, ", out of memory, 
", name); \ + mem_ptr; \ }) #define CCL_MALLOC(size, name) CCL_MALLOC_WRAPPER(size, name) @@ -145,6 +146,14 @@ /* other */ namespace ccl { namespace utils { +static constexpr int invalid_context_id = -1; +static constexpr int invalid_device_id = -1; +static constexpr int invalid_err_code = -1; +static constexpr int invalid_fd = -1; +static constexpr int invalid_mem_handle = -1; +static constexpr int invalid_pid = -1; + +enum class align_kernels { unaligned, aligned, count }; size_t get_ptr_diff(const void* ptr1, const void* ptr2); size_t pof2(size_t number); @@ -212,5 +221,9 @@ void str_to_array(const std::string& input_str, std::string get_substring_between_delims(std::string& full_str, const std::string& start_delim, const std::string& stop_delim); + +uintptr_t get_aligned_offset_byte(const void* ptr, + const size_t buf_size_bytes, + const size_t mem_align_bytes); } // namespace utils } // namespace ccl diff --git a/src/common/ze/ze_api_wrapper.cpp b/src/common/ze/ze_api_wrapper.cpp deleted file mode 100644 index cf176b8f7..000000000 --- a/src/common/ze/ze_api_wrapper.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - Copyright 2016-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#include "common/global/global.hpp" -#include "common/log/log.hpp" -#include "common/stream/stream.hpp" -#include "common/ze/ze_api_wrapper.hpp" -#include "sched/entry/ze/ze_primitives.hpp" - -namespace ccl { - -static void* libze_handle; -libze_ops_t libze_ops; - -bool ze_api_init() { - // lib_path specifies the name and full path to the level-zero library - // it should be absolute and validated path - // pointing to desired libze_loader library - std::string lib_path = ccl::global_data::env().ze_lib_path; - - if (lib_path.empty()) { - lib_path = "libze_loader.so"; - } - - libze_handle = dlopen(lib_path.c_str(), RTLD_LAZY | RTLD_GLOBAL); - if (!libze_handle) { - LOG_WARN("could not open level-zero library: ", lib_path.c_str(), ", error: ", dlerror()); - return false; - } - - void** ops = (void**)((void*)&libze_ops); - int fn_count = sizeof(fn_names) / sizeof(*fn_names); - - for (int i = 0; i < fn_count; ++i) { - ops[i] = dlsym(libze_handle, fn_names[i]); - if (!ops[i]) { - LOG_WARN("dlsym is failed on: ", fn_names[i], ", error: ", dlerror()); - return false; - } - LOG_DEBUG("dlsym loaded of ", fn_count, " - ", i + 1, ": ", fn_names[i]); - } - - return true; -} - -void ze_api_fini() { - if (libze_handle) { - dlclose(libze_handle); - libze_handle = nullptr; - } -} - -} //namespace ccl diff --git a/src/comp/bf16/bf16.cpp b/src/comp/bf16/bf16.cpp index b79915e63..f1a59d14a 100644 --- a/src/comp/bf16/bf16.cpp +++ b/src/comp/bf16/bf16.cpp @@ -24,111 +24,146 @@ #define CCL_BF16_SHIFT 16 std::map bf16_impl_names = { - std::make_pair(ccl_bf16_no_compiler_support, "no_compiler_support"), - std::make_pair(ccl_bf16_no_hardware_support, "no_hardware_support"), + std::make_pair(ccl_bf16_scalar, "scalar"), std::make_pair(ccl_bf16_avx512f, "avx512f"), std::make_pair(ccl_bf16_avx512bf, "avx512bf") }; -std::map bf16_env_impl_names = { - std::make_pair(ccl_bf16_avx512f, "avx512f"), - std::make_pair(ccl_bf16_avx512bf, "avx512bf") -}; +typedef float 
(*ccl_bf16_reduction_scalar_func_ptr)(float a, float b); -#ifdef CCL_BF16_COMPILER +inline float bf16_sum_scalar(float a, float b) { + return a + b; +} + +inline float bf16_prod_scalar(float a, float b) { + return a * b; +} + +inline float bf16_min_scalar(float a, float b) { + return std::min(a, b); +} + +inline float bf16_max_scalar(float a, float b) { + return std::max(a, b); +} + +inline uint16_t ccl_convert_fp32_to_bf16_scalar(float val) { + uint16_t int_val = 0; + memcpy(&int_val, reinterpret_cast(&val) + 2, sizeof(int_val)); + return int_val; +} + +inline float ccl_convert_bf16_to_fp32_scalar(uint16_t val) { + float ret = 0; + uint32_t temp = static_cast(val) << CCL_BF16_SHIFT; + memcpy(&ret, &temp, sizeof(temp)); + return ret; +} + +void ccl_bf16_reduce_scalar_impl(const void* in_buf, + void* inout_buf, + size_t in_count, + ccl::reduction op) { + ccl_bf16_reduction_scalar_func_ptr func = nullptr; + switch (op) { + case ccl::reduction::sum: func = &bf16_sum_scalar; break; + case ccl::reduction::prod: func = &bf16_prod_scalar; break; + case ccl::reduction::min: func = &bf16_min_scalar; break; + case ccl::reduction::max: func = &bf16_max_scalar; break; + default: CCL_FATAL("unexpected value ", ccl::utils::enum_to_underlying(op)); + } + + uint16_t* in_buf_int = (uint16_t*)in_buf; + uint16_t* inout_buf_int = (uint16_t*)inout_buf; + + for (size_t i = 0; i < in_count; i++) { + float in_value_1 = ccl_convert_bf16_to_fp32_scalar(in_buf_int[i]); + float in_value_2 = ccl_convert_bf16_to_fp32_scalar(inout_buf_int[i]); + float out_value = func(in_value_1, in_value_2); + inout_buf_int[i] = ccl_convert_fp32_to_bf16_scalar(out_value); + } +} void ccl_bf16_reduce(const void* in_buf, - size_t in_cnt, + size_t in_count, void* inout_buf, - size_t* out_cnt, + size_t* out_count, ccl::reduction op) { - LOG_DEBUG("BF16 reduction for %zu elements\n", in_cnt); + LOG_DEBUG("BF16 reduction for %zu elements", in_count); - if (out_cnt != nullptr) { - *out_cnt = in_cnt; + if (out_count != 
nullptr) { + *out_count = in_count; } - ccl_bf16_reduce_impl(in_buf, inout_buf, in_cnt, op); + auto bf16_impl_type = ccl::global_data::env().bf16_impl_type; + + if (bf16_impl_type == ccl_bf16_scalar) { + ccl_bf16_reduce_scalar_impl(in_buf, inout_buf, in_count, op); + } + else { +#ifdef CCL_BF16_COMPILER + ccl_bf16_reduce_impl(in_buf, inout_buf, in_count, op); +#else // CCL_BF16_COMPILER + CCL_THROW("unexpected bf16_impl_type: ", bf16_impl_type); +#endif // CCL_BF16_COMPILER + } } +#ifdef CCL_BF16_COMPILER void ccl_convert_fp32_to_bf16(const void* src, void* dst) { #ifdef CCL_BF16_AVX512BF_COMPILER if (ccl::global_data::env().bf16_impl_type == ccl_bf16_avx512bf) { - _mm256_storeu_si256((__m256i*)(dst), (__m256i)_mm512_cvtneps_pbh(_mm512_loadu_ps(src))); + ccl_fp32_store_as_bf16_avx512bf(src, dst); } else -#endif +#endif // CCL_BF16_AVX512BF_COMPILER { - _mm256_storeu_si256((__m256i*)(dst), - _mm512_cvtepi32_epi16(_mm512_bsrli_epi128(_mm512_loadu_si512(src), 2))); + ccl_fp32_store_as_bf16_avx512f(src, dst); } } void ccl_convert_bf16_to_fp32(const void* src, void* dst) { - _mm512_storeu_si512( - dst, - _mm512_bslli_epi128(_mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i const*)src)), 2)); + ccl_bf16_load_as_fp32(src, dst); } +#endif // CCL_BF16_COMPILER void ccl_convert_fp32_to_bf16_arrays(void* fp32_buf, void* bf16_buf, size_t count) { - int int_val = 0, int_val_shifted = 0; float* fp32_buf_float = (float*)fp32_buf; - size_t limit = (count / CCL_FLOATS_IN_M512) * CCL_FLOATS_IN_M512; + uint16_t* bf16_buf_int = (uint16_t*)bf16_buf; + + size_t limit = 0; - for (size_t i = 0; i < limit; i += CCL_FLOATS_IN_M512) { - ccl_convert_fp32_to_bf16(fp32_buf_float + i, ((unsigned char*)bf16_buf) + (2 * i)); +#ifdef CCL_BF16_COMPILER + if (ccl::global_data::env().bf16_impl_type != ccl_bf16_scalar) { + limit = (count / CCL_FLOATS_IN_M512) * CCL_FLOATS_IN_M512; + for (size_t i = 0; i < limit; i += CCL_FLOATS_IN_M512) { + ccl_convert_fp32_to_bf16(fp32_buf_float + i, ((unsigned 
char*)bf16_buf) + (2 * i)); + } } +#endif // CCL_BF16_COMPILER - /* proceed remaining float's in buffer */ + /* process remaining fp32 values */ for (size_t i = limit; i < count; i++) { - /* iterate over bf16_buf */ - int* send_bfp_tail = (int*)(((char*)bf16_buf) + (2 * i)); - /* copy float (4 bytes) data as is to int variable, */ - memcpy(&int_val, &fp32_buf_float[i], 4); - /* then perform shift and */ - int_val_shifted = int_val >> CCL_BF16_SHIFT; - /* save pointer to result */ - *send_bfp_tail = int_val_shifted; + bf16_buf_int[i] = ccl_convert_fp32_to_bf16_scalar(fp32_buf_float[i]); } } void ccl_convert_bf16_to_fp32_arrays(void* bf16_buf, float* fp32_buf, size_t count) { - int int_val = 0, int_val_shifted = 0; - size_t limit = (count / CCL_FLOATS_IN_M512) * CCL_FLOATS_IN_M512; + uint16_t* bf16_buf_int = (uint16_t*)bf16_buf; + + size_t limit = 0; - for (size_t i = 0; i < limit; i += CCL_FLOATS_IN_M512) { - ccl_convert_bf16_to_fp32((char*)bf16_buf + (2 * i), fp32_buf + i); +#ifdef CCL_BF16_COMPILER + if (ccl::global_data::env().bf16_impl_type != ccl_bf16_scalar) { + limit = (count / CCL_FLOATS_IN_M512) * CCL_FLOATS_IN_M512; + for (size_t i = 0; i < limit; i += CCL_FLOATS_IN_M512) { + ccl_convert_bf16_to_fp32((char*)bf16_buf + (2 * i), fp32_buf + i); + } } +#endif // CCL_BF16_COMPILER - /* proceed remaining bf16's in buffer */ + /* process remaining bf16 values */ for (size_t i = limit; i < count; i++) { - /* iterate over bf16_buf */ - int* recv_bfp_tail = (int*)((char*)bf16_buf + (2 * i)); - /* copy bf16 data as is to int variable, */ - memcpy(&int_val, recv_bfp_tail, 4); - /* then perform shift and */ - int_val_shifted = int_val << CCL_BF16_SHIFT; - /* copy result to output */ - memcpy((fp32_buf + i), &int_val_shifted, 4); + fp32_buf[i] = ccl_convert_bf16_to_fp32_scalar(bf16_buf_int[i]); } } - -#else // CCL_BF16_COMPILER - -void ccl_bf16_reduce(const void* in_buf, - size_t in_cnt, - void* inout_buf, - size_t* out_cnt, - ccl::reduction reduction_op) { - 
CCL_FATAL("BF16 reduction was requested but CCL was compiled w/o BF16 support"); -} - -void ccl_convert_fp32_to_bf16_arrays(void* fp32_buf, void* bf16_buf, size_t count) { - CCL_FATAL("FP32->BF16 conversion was requested but CCL was compiled w/o BF16 support"); -} - -void ccl_convert_bf16_to_fp32_arrays(void* bf16_buf, float* fp32_buf, size_t count) { - CCL_FATAL("BF16->FP32 conversion was requested but CCL was compiled w/o BF16 support"); -} - -#endif // CCL_BF16_COMPILER diff --git a/src/comp/bf16/bf16_intrisics.hpp b/src/comp/bf16/bf16_intrisics.hpp index de8670895..7daccd502 100644 --- a/src/comp/bf16/bf16_intrisics.hpp +++ b/src/comp/bf16/bf16_intrisics.hpp @@ -15,17 +15,6 @@ */ #pragma once -#ifdef CCL_BF16_COMPILER - -#include -#include - -#include "common/global/global.hpp" -#include "comp/bf16/bf16_utils.hpp" -#include "oneapi/ccl/types.hpp" - -#define CCL_BF16_IN_M256 16 - #ifdef CCL_BF16_TARGET_ATTRIBUTES #ifdef CCL_BF16_AVX512BF_COMPILER @@ -52,6 +41,17 @@ #endif // CCL_BF16_TARGET_ATTRIBUTES +#ifdef CCL_BF16_COMPILER + +#include +#include + +#include "common/global/global.hpp" +#include "comp/bf16/bf16_utils.hpp" +#include "oneapi/ccl/types.hpp" + +#define CCL_BF16_IN_M256 16 + typedef __m512 (*ccl_bf16_reduction_func_ptr)(__m512 a, __m512 b); BF16_TARGET_ATTRIBUTE_BWF __m512 bf16_sum_wrap(__m512 a, __m512 b); BF16_TARGET_ATTRIBUTE_BWF __m512 bf16_prod_wrap(__m512 a, __m512 b); @@ -73,7 +73,7 @@ BF16_INLINE_TARGET_ATTRIBUTE_BW void ccl_fp32_store_as_bf16_avx512f(const void* BF16_INLINE_TARGET_ATTRIBUTE void ccl_fp32_store_as_bf16_avx512bf(const void* src, void* dst) { _mm256_storeu_si256((__m256i*)(dst), (__m256i)_mm512_cvtneps_pbh(_mm512_loadu_ps(src))); } -#endif +#endif // CCL_BF16_AVX512BF_COMPILER #define CCL_BF16_DEFINE_REDUCE_FUNC(impl_type) \ \ @@ -116,7 +116,7 @@ BF16_INLINE_TARGET_ATTRIBUTE void ccl_fp32_store_as_bf16_avx512bf(const void* sr CCL_BF16_DEFINE_REDUCE_FUNC(avx512f); #ifdef CCL_BF16_AVX512BF_COMPILER 
CCL_BF16_DEFINE_REDUCE_FUNC(avx512bf); -#endif +#endif // CCL_BF16_AVX512BF_COMPILER BF16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_bf16_reduce_impl(const void* in_buf, void* inout_buf, @@ -128,17 +128,22 @@ BF16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_bf16_reduce_impl(const void* in_buf, case ccl::reduction::prod: func = &bf16_prod_wrap; break; case ccl::reduction::min: func = &bf16_min_wrap; break; case ccl::reduction::max: func = &bf16_max_wrap; break; - default: CCL_FATAL("unexpected value ", utils::enum_to_underlying(op)); + default: CCL_FATAL("unexpected value ", ccl::utils::enum_to_underlying(op)); } auto impl_type = ccl::global_data::env().bf16_impl_type; - if (impl_type == ccl_bf16_avx512f) + if (impl_type == ccl_bf16_avx512f) { ccl_bf16_reduce_impl_avx512f(in_buf, inout_buf, in_cnt, func); + } #ifdef CCL_BF16_AVX512BF_COMPILER - else if (impl_type == ccl_bf16_avx512bf) + else if (impl_type == ccl_bf16_avx512bf) { ccl_bf16_reduce_impl_avx512bf(in_buf, inout_buf, in_cnt, func); -#endif + } +#endif // CCL_BF16_AVX512BF_COMPILER + else { + CCL_THROW("unexpected bf16_impl_type: ", impl_type); + } } #endif // CCL_BF16_COMPILER diff --git a/src/comp/bf16/bf16_utils.hpp b/src/comp/bf16/bf16_utils.hpp index 06d845d68..2017b3917 100644 --- a/src/comp/bf16/bf16_utils.hpp +++ b/src/comp/bf16/bf16_utils.hpp @@ -21,21 +21,17 @@ #ifdef CCL_BF16_COMPILER #include -#endif +#endif // CCL_BF16_COMPILER -typedef enum { - ccl_bf16_no_compiler_support = 0, - ccl_bf16_no_hardware_support, - ccl_bf16_avx512f, - ccl_bf16_avx512bf -} ccl_bf16_impl_type; +typedef enum { ccl_bf16_scalar = 0, ccl_bf16_avx512f, ccl_bf16_avx512bf } ccl_bf16_impl_type; extern std::map bf16_impl_names; -extern std::map bf16_env_impl_names; __attribute__((__always_inline__)) inline std::set ccl_bf16_get_impl_types() { std::set result; + result.insert(ccl_bf16_scalar); + #ifdef CCL_BF16_COMPILER int is_avx512f_enabled = 0; int is_avx512bf_enabled = 0; @@ -66,12 +62,7 @@ __attribute__((__always_inline__)) inline 
std::set ccl_bf16_ if (is_avx512bf_enabled) result.insert(ccl_bf16_avx512bf); - - if (!is_avx512f_enabled && !is_avx512bf_enabled) - result.insert(ccl_bf16_no_hardware_support); -#else - result.insert(ccl_bf16_no_compiler_support); -#endif +#endif // CCL_BF16_COMPILER return result; } diff --git a/src/comp/comp.cpp b/src/comp/comp.cpp index b8976e970..add8ec816 100644 --- a/src/comp/comp.cpp +++ b/src/comp/comp.cpp @@ -20,11 +20,11 @@ #include "common/global/global.hpp" #include "common/utils/enums.hpp" #include "common/utils/memcpy.hpp" -#include "common/utils/sycl_utils.hpp" #include "oneapi/ccl/types.hpp" #include "sched/queue/queue.hpp" #ifdef CCL_ENABLE_SYCL +#include "common/utils/sycl_utils.hpp" #include #endif // CCL_ENABLE_SYCL @@ -53,7 +53,7 @@ inout_buf_##type[i] = std::max(in_buf_##type[i], inout_buf_##type[i]); \ } \ break; \ - default: CCL_FATAL("unexpected value ", utils::enum_to_underlying(reduction)); \ + default: CCL_FATAL("unexpected value ", ccl::utils::enum_to_underlying(reduction)); \ } \ } while (0) diff --git a/src/comp/fp16/fp16_intrisics.hpp b/src/comp/fp16/fp16_intrisics.hpp index ca88b0dba..1675019f3 100644 --- a/src/comp/fp16/fp16_intrisics.hpp +++ b/src/comp/fp16/fp16_intrisics.hpp @@ -171,7 +171,7 @@ FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_impl(const void* in_buf, case ccl::reduction::prod: func_256 = &fp16_prod_wrap_256; break; case ccl::reduction::min: func_256 = &fp16_min_wrap_256; break; case ccl::reduction::max: func_256 = &fp16_max_wrap_256; break; - default: CCL_FATAL("unexpected value ", utils::enum_to_underlying(op)); + default: CCL_FATAL("unexpected value ", ccl::utils::enum_to_underlying(op)); } ccl_fp16_reduce_impl_256(in_buf, inout_buf, in_cnt, func_256); } @@ -181,7 +181,7 @@ FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_impl(const void* in_buf, case ccl::reduction::prod: func_512 = &fp16_prod_wrap_512; break; case ccl::reduction::min: func_512 = &fp16_min_wrap_512; break; case ccl::reduction::max: 
func_512 = &fp16_max_wrap_512; break; - default: CCL_FATAL("unexpected value ", utils::enum_to_underlying(op)); + default: CCL_FATAL("unexpected value ", ccl::utils::enum_to_underlying(op)); } ccl_fp16_reduce_impl_512(in_buf, inout_buf, in_cnt, func_512); } diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index 527804ce0..5ce3d257a 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -22,12 +22,13 @@ #include "sched/sched.hpp" size_t ccl_executor::get_worker_idx_by_sched_id(ccl_sched* sched) { - return sched->sched_id % workers.size(); -} + if (sched->get_scaleout_flag()) { + // sched->sched_id is same as master sched id for all sub schedules + // so use the op_id to get different workers for different sub schedules + return sched->get_op_id() % workers.size(); + } -size_t ccl_executor::get_worker_idx_round_robin(ccl_sched* sched) { - ++rr_worker_idx %= workers.size(); - return rr_worker_idx; + return sched->sched_id % workers.size(); } size_t ccl_executor::calculate_atl_ep_count(size_t worker_count) { @@ -75,53 +76,60 @@ std::unique_ptr ccl_executor::create_sched_queue(size_t idx, ccl_executor::ccl_executor(const char* main_addr) { auto& env = ccl::global_data::env(); - get_worker_idx_fn = (env.enable_fusion || env.enable_unordered_coll) - ? 
&ccl_executor::get_worker_idx_by_sched_id - : &ccl_executor::get_worker_idx_round_robin; - /* generate ATL attr for all future communicators */ atl_comm_manager::set_internal_env(generate_atl_attr(env)); atl_comm_manager::set_executor(this); } -void ccl_executor::start_workers(int proc_idx, int proc_count) { - set_local_coord(proc_idx, proc_count); - auto& env = ccl::global_data::env(); - CCL_THROW_IF_NOT(env.env_2_worker_affinity(get_local_proc_idx(), get_local_proc_count())); - CCL_THROW_IF_NOT(env.env_2_worker_mem_affinity(get_local_proc_count())); - start_workers(); -} - -void ccl_executor::start_workers() { +void ccl_executor::start_workers(atl_proc_coord_t& coord) { auto& env = ccl::global_data::env(); + auto& global_data = ccl::global_data::get(); auto worker_count = env.worker_count; auto ep_count = calculate_atl_ep_count(worker_count); + // firstly, check if an end user sets local coordinates. + // if it is not, take coordinates from ATL coord structure + if (global_data.get_local_proc_idx() == CCL_ENV_INT_NOT_SPECIFIED || + global_data.get_local_proc_count() == CCL_ENV_INT_NOT_SPECIFIED) { + global_data.set_local_proc_idx(coord.local_idx); + global_data.set_local_proc_count(coord.local_count); + LOG_INFO("local_proc_idx: ", + global_data.get_local_proc_idx(), + ", local_proc_count: ", + global_data.get_local_proc_count(), + " are set by ATL transport"); + } + + CCL_THROW_IF_NOT(env.env_2_worker_affinity(global_data.get_local_proc_idx(), + global_data.get_local_proc_count())); + CCL_THROW_IF_NOT(env.env_2_worker_mem_affinity(global_data.get_local_proc_count())); + if (env.worker_offload) { - CCL_THROW_IF_NOT(env.worker_affinity.size() >= get_local_proc_count() * worker_count, - "unexpected worker affinity length ", - env.worker_affinity.size(), - ", should be ", - get_local_proc_count() * worker_count); + CCL_THROW_IF_NOT( + env.worker_affinity.size() >= global_data.get_local_proc_count() * worker_count, + "unexpected worker affinity length ", + 
env.worker_affinity.size(), + ", should be ", + global_data.get_local_proc_count() * worker_count); } size_t ep_per_worker = ep_count / worker_count; for (size_t idx = 0; idx < worker_count; idx++) { if (env.enable_fusion && idx == 0) { LOG_DEBUG("create service worker"); - workers.emplace_back(new ccl_service_worker(idx, - create_sched_queue(idx, ep_per_worker), - *ccl::global_data::get().fusion_manager)); + workers.emplace_back(new ccl_service_worker( + idx, create_sched_queue(idx, ep_per_worker), *global_data.fusion_manager)); } else { workers.emplace_back(new ccl_worker(idx, create_sched_queue(idx, ep_per_worker))); } if (env.worker_offload) { - size_t cpu_affinity = env.worker_affinity[get_local_proc_idx() * worker_count + idx]; + size_t cpu_affinity = + env.worker_affinity[global_data.get_local_proc_idx() * worker_count + idx]; size_t mem_affinity = - env.worker_mem_affinity[get_local_proc_idx() * worker_count + idx]; + env.worker_mem_affinity[global_data.get_local_proc_idx() * worker_count + idx]; CCL_THROW_IF_NOT( workers.back()->start(cpu_affinity, mem_affinity) == ccl::status::success, @@ -129,7 +137,7 @@ void ccl_executor::start_workers() { idx); LOG_DEBUG("started worker: local_proc_idx ", - get_local_proc_idx(), + global_data.get_local_proc_idx(), ", worker_idx ", idx, ", cpu: ", @@ -246,7 +254,7 @@ void ccl_executor::start(ccl_sched* sched, bool extra_sched) { size_t worker_idx; auto& partial_scheds = sched->get_subscheds(); for (size_t idx = 0; idx < partial_scheds.size(); idx++) { - worker_idx = (this->*get_worker_idx_fn)(partial_scheds[idx].get()); + worker_idx = get_worker_idx_by_sched_id(partial_scheds[idx].get()); LOG_DEBUG( "worker idx: ", worker_idx, ", coll: ", ccl_coll_type_to_str(sched->coll_param.ctype)); workers[worker_idx]->add(partial_scheds[idx].get()); @@ -288,51 +296,6 @@ void ccl_executor::do_work() { } } -void ccl_executor::getenv_local_coord(const char* local_proc_idx_env_name, - const char* local_proc_count_env_name) { - char* 
local_idx_env = getenv(local_proc_idx_env_name); - char* local_count_env = getenv(local_proc_count_env_name); - if (local_idx_env && local_count_env) { - local_proc_idx = std::atoi(local_idx_env); - local_proc_count = std::atoi(local_count_env); - CCL_THROW_IF_NOT(local_proc_idx != CCL_ENV_INT_NOT_SPECIFIED, - "unexpected local_proc_idx ", - local_proc_idx); - CCL_THROW_IF_NOT(local_proc_count != CCL_ENV_INT_NOT_SPECIFIED, - "unexpected local_proc_count ", - local_proc_count); - } - else { - LOG_WARN(local_idx_env, " or ", local_count_env, " not found"); - LOG_WARN("use local_proc_idx: ", local_proc_idx, " , local_proc_count: ", local_proc_count) - } -} - -void ccl_executor::set_local_coord(int proc_idx, int proc_count) { - local_proc_idx = proc_idx; - local_proc_count = proc_count; - auto& env = ccl::global_data::env(); - - if (env.process_launcher == process_launcher_mode::hydra) { - getenv_local_coord("MPI_LOCALRANKID", "MPI_LOCALNRANKS"); - } - else if (env.process_launcher == process_launcher_mode::torch) { - getenv_local_coord("LOCAL_RANK", "LOCAL_WORLD_SIZE"); - } - else if (env.process_launcher == process_launcher_mode::none) { - getenv_local_coord("CCL_LOCAL_RANK", "CCL_LOCAL_SIZE"); - } - else { - CCL_THROW("unexpected process launcher"); - } - LOG_INFO("process launcher: ", - ccl::env_data::process_launcher_names[env.process_launcher], - ", local_proc_idx: ", - local_proc_idx, - ", local_proc_count: ", - local_proc_count); -} - size_t ccl_executor::get_worker_count() const { return workers.size(); } diff --git a/src/exec/exec.hpp b/src/exec/exec.hpp index 51f447960..9f115f33d 100644 --- a/src/exec/exec.hpp +++ b/src/exec/exec.hpp @@ -68,8 +68,7 @@ class alignas(CACHELINE_SIZE) ccl_executor { void wait(const ccl_request* req); bool test(const ccl_request* req); - void start_workers(); - void start_workers(int local_proc_idx, int local_proc_count); + void start_workers(atl_proc_coord_t& coord); bool are_workers_started() { return workers_started; }; @@ 
-85,13 +84,6 @@ class alignas(CACHELINE_SIZE) ccl_executor { void unlock_workers(); bool is_locked = false; - int get_local_proc_idx() const { - return local_proc_idx; - } - int get_local_proc_count() const { - return local_proc_count; - } - static size_t calculate_atl_ep_count(size_t worker_count); static atl_attr_t generate_atl_attr(const ccl::env_data& env); @@ -101,18 +93,11 @@ class alignas(CACHELINE_SIZE) ccl_executor { std::unique_ptr create_sched_queue(size_t idx, size_t ep_per_worker); void do_work(); - void set_local_coord(int proc_idx, int proc_count); - void getenv_local_coord(const char* local_idx_env_name, const char* local_count_env_name); std::vector> workers; // TODO: Rework to support listener // std::unique_ptr listener; - typedef size_t (ccl_executor::*get_worker_idx_fn_t)(ccl_sched* sched); - get_worker_idx_fn_t get_worker_idx_fn; - size_t rr_worker_idx = 0; /* to distribute work in round-robin */ - int local_proc_idx; - int local_proc_count; bool workers_started = false; }; @@ -126,13 +111,14 @@ inline void ccl_release_sched(ccl_sched* sched) { } inline void ccl_release_request(ccl_request* req) { - CCL_THROW_IF_NOT(req->get_sched(), "sched is not set for request"); - auto* sched = req->get_sched(); + + CCL_THROW_IF_NOT(sched, "sched is not set for request"); + // if the released request is not the current active one, then we need // to explicitly delete it, otherwise it's going to be deleted in sched's // destructor - if (req != req->get_sched()->get_request()) { + if (req != sched->get_request()) { LOG_DEBUG("deleting req ", req, " detached from sched ", sched); delete req; } @@ -149,7 +135,8 @@ inline void ccl_wait_impl(ccl_executor* exec, ccl_request* request) { request, " completed, sched ", ccl_coll_type_to_str(static_cast(request->get_sched())->coll_param.ctype)); - ccl_release_request(request); + if (!request->synchronous) + ccl_release_request(request); } } diff --git a/src/exec/thread/base_thread.cpp b/src/exec/thread/base_thread.cpp 
index 247c1789a..930946f71 100644 --- a/src/exec/thread/base_thread.cpp +++ b/src/exec/thread/base_thread.cpp @@ -113,13 +113,13 @@ int ccl_base_thread::get_real_cpu_affinity() { LOG_ERROR("pthread_getaffinity_np failed, err ", pthread_err); } - for (int idx = 0; idx < CPU_SETSIZE; idx++) { - if (__CPU_ISSET_S(idx, sizeof(cpu_set_t), &cpuset)) { + for (int cpu_idx = 0; cpu_idx < CPU_SETSIZE; cpu_idx++) { + if (__CPU_ISSET_S(cpu_idx, sizeof(cpu_set_t), &cpuset)) { if (result == CCL_UNDEFINED_CPU_ID) { - result = idx; + result = cpu_idx; } else { - CCL_THROW("multiple affinity cores, previous ", result, ", new ", idx); + CCL_THROW("multiple affinity cores, previous ", result, ", new ", cpu_idx); } } } diff --git a/src/kernels/bf16.h b/src/kernels/bf16.h new file mode 100644 index 000000000..d0da9cdc8 --- /dev/null +++ b/src/kernels/bf16.h @@ -0,0 +1,80 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#ifdef CCL_BF16_GPU_TRUNCATE + +float __bf16_to_fp32(ushort V) { + uint temp = convert_uint(V) << 16; + return as_float(temp); +} + +ushort __fp32_to_bf16(float V) { + ushort2 temp = as_ushort2(V); + return temp.s1; +} + +#else // CCL_BF16_GPU_TRUNCATE + +#ifdef cl_intel_bfloat16_conversions +#pragma OPENCL EXTENSION cl_intel_bfloat16_conversions : enable +#else // cl_intel_bfloat16_conversions + +// declare SPIR-V intrinsics directly +ushort __builtin_spirv_OpConvertFToBF16INTEL_f32(float); +float __builtin_spirv_OpConvertBF16ToFINTEL_i16(ushort); + +// implement built-in functions using these intrinsics +#define __ovld __attribute__((overloadable)) +ushort __ovld intel_convert_bfloat16_as_ushort(float f) { + return __builtin_spirv_OpConvertFToBF16INTEL_f32(f); +} + +float __ovld intel_convert_as_bfloat16_float(ushort u) { + return __builtin_spirv_OpConvertBF16ToFINTEL_i16(u); +} + +#endif // cl_intel_bfloat16_conversions + +float __bf16_to_fp32(ushort V) { + return intel_convert_as_bfloat16_float(V); +} + +ushort __fp32_to_bf16(float V) { + return intel_convert_bfloat16_as_ushort(V); +} + +#endif // CCL_BF16_GPU_TRUNCATE + +#define DEFINE_BF16SUM_OP(T) \ + T __bf16_sum_##T(T lhs, T rhs) { \ + return __fp32_to_bf16(__bf16_to_fp32(lhs) + __bf16_to_fp32(rhs)); \ + } + +#define DEFINE_BF16PROD_OP(T) \ + T __bf16_prod_##T(T lhs, T rhs) { \ + return __fp32_to_bf16(__bf16_to_fp32(lhs) * __bf16_to_fp32(rhs)); \ + } + +#define DEFINE_BF16MIN_OP(T) \ + T __bf16_min_##T(T lhs, T rhs) { \ + return __fp32_to_bf16(min(__bf16_to_fp32(lhs), __bf16_to_fp32(rhs))); \ + } + +#define DEFINE_BF16MAX_OP(T) \ + T __bf16_max_##T(T lhs, T rhs) { \ + return __fp32_to_bf16(max(__bf16_to_fp32(lhs), __bf16_to_fp32(rhs))); \ + } diff --git a/src/kernels/common.h b/src/kernels/common.h index fd73ad8fa..c73f9101e 100644 --- a/src/kernels/common.h +++ b/src/kernels/common.h @@ -18,7 +18,8 @@ #pragma OPENCL EXTENSION cl_intel_subgroups : enable #pragma OPENCL EXTENSION 
cl_khr_subgroups : enable -#include "lp.h" +#include "bf16.h" +#include "fp16.h" #define FORMAT_int8_t "%hhd" #define FORMAT_int16_t "%d" diff --git a/src/kernels/lp.h b/src/kernels/fp16.h similarity index 81% rename from src/kernels/lp.h rename to src/kernels/fp16.h index 890a254c7..7090c2da5 100644 --- a/src/kernels/lp.h +++ b/src/kernels/fp16.h @@ -13,151 +13,119 @@ See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#ifdef CCL_BF16_GPU_TRUNCATE -float __bf16_to_fp32(ushort V) { - uint temp = convert_uint(V) << 16; - return as_float(temp); -} - -ushort __fp32_to_bf16(float V) { - ushort2 temp = as_ushort2(V); - return temp.s1; -} -#endif // CCL_BF16_GPU_TRUNCATE - -#define DEFINE_BF16SUM_OP(T) \ - T __bf16_sum_##T(T lhs, T rhs) { \ - return __fp32_to_bf16(__bf16_to_fp32(lhs) + __bf16_to_fp32(rhs)); \ - } - -#define DEFINE_BF16PROD_OP(T) \ - T __bf16_prod_##T(T lhs, T rhs) { \ - return __fp32_to_bf16(__bf16_to_fp32(lhs) * __bf16_to_fp32(rhs)); \ - } - -#define DEFINE_BF16MIN_OP(T) \ - T __bf16_min_##T(T lhs, T rhs) { \ - return __fp32_to_bf16(min(__bf16_to_fp32(lhs), __bf16_to_fp32(rhs))); \ - } - -#define DEFINE_BF16MAX_OP(T) \ - T __bf16_max_##T(T lhs, T rhs) { \ - return __fp32_to_bf16(max(__bf16_to_fp32(lhs), __bf16_to_fp32(rhs))); \ - } - -#ifdef CCL_FP16_GPU_TRUNCATE -/* -Truncation routines for converting fp32 <-> fp16 - -fp16 has 1 sign bit, 5 exponent bits and 10 significand bits with exponent -offset 15 - https://en.wikipedia.org/wiki/Half-precision_floating-point_format - -For fp16 -> fp32 - -The sign & significand bits are unchanged, but the exponent must be properly -re-offset (i.e. convert the fp16 offset -> fp32 offset). Care must also be taken -to saturate the fp32 result if the fp16 result is saturated. Denormals must be -flushed to 0. 
- -For fp32 -> fp16 - -Similar to fp16 -> fp32 except that the exponent must be checked for saturation -since the range of the exponent is signficantly smaller than that of fp32. -*/ -float __fp16_to_fp32(half V) { - uint ans_bits = 0; - uint exp_bits = as_ushort(V) & 0x7C00; - uint significand_bits = as_ushort(V) & 0x03FF; - if (exp_bits == 0x7C00) { - ans_bits = ((as_ushort(V) & 0x8000) << 16) | 0x7F800000 | (significand_bits << 13); - } - else if (exp_bits == 0x0000) { - if (significand_bits != 0x00000000) { - ans_bits = ((as_ushort(V) & 0x8000) << 16); - } - else { - ans_bits = ((as_ushort(V) & 0x8000) << 16) | (significand_bits << 13); - } - } - else { - ans_bits = ((as_ushort(V) & 0x8000) << 16) | ((exp_bits + 0x1C000) << 13) | - (significand_bits << 13); - } - return as_float(ans_bits); -} - -half __fp32_to_fp16(float V) { - ushort ans; - uint exp_bits = (as_uint(V) & 0x7F800000); - uint significand_bits = (as_uint(V) & 0x007FFFFF); - if (exp_bits == 0x00000000) { - ans = (as_uint(V) & 0x80000000) >> 16; - } - else if (exp_bits == 0x7F800000) { - if (significand_bits != 0) { - ans = ((as_uint(V) & 0x80000000) >> 16) | 0x00007C01; - } - else { - ans = ((as_uint(V) & 0x80000000) >> 16) | 0x00007C00; - } - } - else if (exp_bits < 0x38800000) { - ans = 0xFC00; - } - else if (exp_bits > 0x47000000) { - ans = 0x7C00; - } - else { - ans = ((as_uint(V) & 0x80000000) >> 16) | - ((((as_uint(V) & 0x7F800000) >> 23) - 112) << 10) | ((as_uint(V) & 0x007FFFFF) >> 13); - } - return as_half(ans); -} - -#define DEFINE_FP16SUM_OP(T) \ - T __sum_##T(T lhs, T rhs) { \ - return __fp32_to_fp16(__fp16_to_fp32(lhs) + __fp16_to_fp32(rhs)); \ - } - -#define DEFINE_FP16PROD_OP(T) \ - T __prod_##T(T lhs, T rhs) { \ - return __fp32_to_fp16(__fp16_to_fp32(lhs) * __fp16_to_fp32(rhs)); \ - } - -#define DEFINE_FP16MIN_OP(T) \ - T __min_##T(T lhs, T rhs) { \ - return __fp32_to_fp16(min(__fp16_to_fp32(lhs), __fp16_to_fp32(rhs))); \ - } - -#define DEFINE_FP16MAX_OP(T) \ - T __max_##T(T lhs, T 
rhs) { \ - return __fp32_to_fp16(max(__fp16_to_fp32(lhs), __fp16_to_fp32(rhs))); \ - } - -#else // CCL_FP16_GPU_TRUNCATE - -#define DEFINE_FP16SUM_OP(T) \ - T __sum_##T(T lhs, T rhs) { \ - return lhs + rhs; \ - } - -#define DEFINE_FP16PROD_OP(T) \ - T __prod_##T(T lhs, T rhs) { \ - return lhs * rhs; \ - } - -#define DEFINE_FP16MIN_OP(T) \ - T __min_##T(T lhs, T rhs) { \ - return min(lhs, rhs); \ - } - -#define DEFINE_FP16MAX_OP(T) \ - T __max_##T(T lhs, T rhs) { \ - return max(lhs, rhs); \ - } - -#endif // CCL_FP16_GPU_TRUNCATE +#pragma once + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef CCL_FP16_GPU_TRUNCATE +/* +Truncation routines for converting fp32 <-> fp16 + +fp16 has 1 sign bit, 5 exponent bits and 10 significand bits with exponent +offset 15 - https://en.wikipedia.org/wiki/Half-precision_floating-point_format + +For fp16 -> fp32 + +The sign & significand bits are unchanged, but the exponent must be properly +re-offset (i.e. convert the fp16 offset -> fp32 offset). Care must also be taken +to saturate the fp32 result if the fp16 result is saturated. Denormals must be +flushed to 0. + +For fp32 -> fp16 + +Similar to fp16 -> fp32 except that the exponent must be checked for saturation +since the range of the exponent is signficantly smaller than that of fp32. 
+*/ +float __fp16_to_fp32(half V) { + uint ans_bits = 0; + uint exp_bits = as_ushort(V) & 0x7C00; + uint significand_bits = as_ushort(V) & 0x03FF; + if (exp_bits == 0x7C00) { + ans_bits = ((as_ushort(V) & 0x8000) << 16) | 0x7F800000 | (significand_bits << 13); + } + else if (exp_bits == 0x0000) { + if (significand_bits != 0x00000000) { + ans_bits = ((as_ushort(V) & 0x8000) << 16); + } + else { + ans_bits = ((as_ushort(V) & 0x8000) << 16) | (significand_bits << 13); + } + } + else { + ans_bits = ((as_ushort(V) & 0x8000) << 16) | ((exp_bits + 0x1C000) << 13) | + (significand_bits << 13); + } + return as_float(ans_bits); +} + +half __fp32_to_fp16(float V) { + ushort ans; + uint exp_bits = (as_uint(V) & 0x7F800000); + uint significand_bits = (as_uint(V) & 0x007FFFFF); + if (exp_bits == 0x00000000) { + ans = (as_uint(V) & 0x80000000) >> 16; + } + else if (exp_bits == 0x7F800000) { + if (significand_bits != 0) { + ans = ((as_uint(V) & 0x80000000) >> 16) | 0x00007C01; + } + else { + ans = ((as_uint(V) & 0x80000000) >> 16) | 0x00007C00; + } + } + else if (exp_bits < 0x38800000) { + ans = 0xFC00; + } + else if (exp_bits > 0x47000000) { + ans = 0x7C00; + } + else { + ans = ((as_uint(V) & 0x80000000) >> 16) | + ((((as_uint(V) & 0x7F800000) >> 23) - 112) << 10) | ((as_uint(V) & 0x007FFFFF) >> 13); + } + return as_half(ans); +} + +#define DEFINE_FP16SUM_OP(T) \ + T __sum_##T(T lhs, T rhs) { \ + return __fp32_to_fp16(__fp16_to_fp32(lhs) + __fp16_to_fp32(rhs)); \ + } + +#define DEFINE_FP16PROD_OP(T) \ + T __prod_##T(T lhs, T rhs) { \ + return __fp32_to_fp16(__fp16_to_fp32(lhs) * __fp16_to_fp32(rhs)); \ + } + +#define DEFINE_FP16MIN_OP(T) \ + T __min_##T(T lhs, T rhs) { \ + return __fp32_to_fp16(min(__fp16_to_fp32(lhs), __fp16_to_fp32(rhs))); \ + } + +#define DEFINE_FP16MAX_OP(T) \ + T __max_##T(T lhs, T rhs) { \ + return __fp32_to_fp16(max(__fp16_to_fp32(lhs), __fp16_to_fp32(rhs))); \ + } + +#else // CCL_FP16_GPU_TRUNCATE + +#define DEFINE_FP16SUM_OP(T) \ + T __sum_##T(T lhs, T 
rhs) { \ + return lhs + rhs; \ + } + +#define DEFINE_FP16PROD_OP(T) \ + T __prod_##T(T lhs, T rhs) { \ + return lhs * rhs; \ + } + +#define DEFINE_FP16MIN_OP(T) \ + T __min_##T(T lhs, T rhs) { \ + return min(lhs, rhs); \ + } + +#define DEFINE_FP16MAX_OP(T) \ + T __max_##T(T lhs, T rhs) { \ + return max(lhs, rhs); \ + } + +#endif // CCL_FP16_GPU_TRUNCATE diff --git a/src/kernels/kernels.cl b/src/kernels/kernels.cl index 47bcdf28b..dbc5aa49a 100644 --- a/src/kernels/kernels.cl +++ b/src/kernels/kernels.cl @@ -28,8 +28,9 @@ __kernel void empty_kernel(int my_rank, size_t thread_id = get_global_id(0); \ for (size_t i = 0; thread_id + i < count; i += work_group_size) { \ const size_t idx = thread_id + i; \ - output_buffer[idx] = OpFunc(input_buffer[idx], peer_input_buffer[idx]); \ - peer_output_buffer[idx] = output_buffer[idx]; \ + Dtype ret = OpFunc(input_buffer[idx], peer_input_buffer[idx]); \ + output_buffer[idx] = ret; \ + peer_output_buffer[idx] = ret; \ } \ } @@ -62,6 +63,213 @@ __kernel void empty_kernel(int my_rank, } \ } +#define DEFINE_REDUCE_SINGLE_LOCAL_INPLACE_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void reduce_single_local_inplace_kernel_##DtypeName##_##OpName( \ + ulong count, \ + int peer_count, \ + const __global Dtype* input_buffer, \ + __global Dtype* inoutput_buffer) { \ + DEBUG_BLOCK(printf("in reduce_single_local_inplace_kernel\n")); \ + size_t work_group_size = get_global_size(0); \ + size_t thread_id = get_global_id(0); \ + for (size_t i = 0; thread_id + i < count; i += work_group_size) { \ + const size_t idx = thread_id + i; \ + Dtype ret = OpFunc(input_buffer[idx], inoutput_buffer[idx]); \ + for (int j = 1; j < peer_count; j++) { \ + ret = OpFunc(inoutput_buffer[j * count + idx], ret); \ + } \ + inoutput_buffer[idx] = ret; \ + } \ + } + +#define DEFINE_REDUCE_MONOLITHIC_1_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void reduce_monolithic_kernel_1_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* 
input_buffer, \ + const __global Dtype* peer_buffer1, \ + __global Dtype* output_buffer) { \ + DEBUG_BLOCK(printf("in reduce_monolithic_kernel_1\n")); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + Dtype sum = input_buffer[idx]; \ + sum = OpFunc(sum, peer_buffer1[idx]); \ + output_buffer[idx] = sum; \ + } \ + } + +#define DEFINE_REDUCE_MONOLITHIC_2_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void reduce_monolithic_kernel_2_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + const __global Dtype* peer_buffer1, \ + const __global Dtype* peer_buffer2, \ + __global Dtype* output_buffer) { \ + DEBUG_BLOCK(printf("in reduce_monolithic_kernel_2\n")); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + Dtype sum = input_buffer[idx]; \ + sum = OpFunc(sum, peer_buffer1[idx]); \ + sum = OpFunc(sum, peer_buffer2[idx]); \ + output_buffer[idx] = sum; \ + } \ + } + +#define DEFINE_REDUCE_MONOLITHIC_3_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void reduce_monolithic_kernel_3_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + const __global Dtype* peer_buffer1, \ + const __global Dtype* peer_buffer2, \ + const __global Dtype* peer_buffer3, \ + __global Dtype* output_buffer) { \ + DEBUG_BLOCK(printf("in reduce_monolithic_kernel_3\n")); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + Dtype sum = input_buffer[idx]; \ + sum = OpFunc(sum, peer_buffer1[idx]); \ + sum = OpFunc(sum, peer_buffer2[idx]); \ + sum = OpFunc(sum, peer_buffer3[idx]); \ + output_buffer[idx] = sum; \ + } \ + } + +#define 
DEFINE_REDUCE_MONOLITHIC_4_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void reduce_monolithic_kernel_4_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + const __global Dtype* peer_buffer1, \ + const __global Dtype* peer_buffer2, \ + const __global Dtype* peer_buffer3, \ + const __global Dtype* peer_buffer4, \ + __global Dtype* output_buffer) { \ + DEBUG_BLOCK(printf("in reduce_monolithic_kernel_4\n")); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + Dtype sum = input_buffer[idx]; \ + sum = OpFunc(sum, peer_buffer1[idx]); \ + sum = OpFunc(sum, peer_buffer2[idx]); \ + sum = OpFunc(sum, peer_buffer3[idx]); \ + sum = OpFunc(sum, peer_buffer4[idx]); \ + output_buffer[idx] = sum; \ + } \ + } + +#define DEFINE_REDUCE_MONOLITHIC_5_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void reduce_monolithic_kernel_5_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + const __global Dtype* peer_buffer1, \ + const __global Dtype* peer_buffer2, \ + const __global Dtype* peer_buffer3, \ + const __global Dtype* peer_buffer4, \ + const __global Dtype* peer_buffer5, \ + __global Dtype* output_buffer) { \ + DEBUG_BLOCK(printf("in reduce_monolithic_kernel_5\n")); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + Dtype sum = input_buffer[idx]; \ + sum = OpFunc(sum, peer_buffer1[idx]); \ + sum = OpFunc(sum, peer_buffer2[idx]); \ + sum = OpFunc(sum, peer_buffer3[idx]); \ + sum = OpFunc(sum, peer_buffer4[idx]); \ + sum = OpFunc(sum, peer_buffer5[idx]); \ + output_buffer[idx] = sum; \ + } \ + } + +#define DEFINE_WRITE_MONOLITHIC_1_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void write_monolithic_kernel_1_##DtypeName##_##OpName( \ + ulong count, 
const __global Dtype* input_buffer, __global Dtype* peer_buffer1) { \ + DEBUG_BLOCK(printf("in write_monolithic_kernel_1 count %d\n", count)); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + const Dtype val = input_buffer[idx]; \ + peer_buffer1[idx] = val; \ + } \ + } + +#define DEFINE_WRITE_MONOLITHIC_2_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void write_monolithic_kernel_2_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + __global Dtype* peer_buffer1, \ + __global Dtype* peer_buffer2) { \ + DEBUG_BLOCK(printf("in write_monolithic_kernel_2 count %d\n", count)); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + const Dtype val = input_buffer[idx]; \ + peer_buffer1[idx] = val; \ + peer_buffer2[idx] = val; \ + } \ + } + +#define DEFINE_WRITE_MONOLITHIC_3_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void write_monolithic_kernel_3_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + __global Dtype* peer_buffer1, \ + __global Dtype* peer_buffer2, \ + __global Dtype* peer_buffer3) { \ + DEBUG_BLOCK(printf("in write_monolithic_kernel_3 count %d\n", count)); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + const Dtype val = input_buffer[idx]; \ + peer_buffer1[idx] = val; \ + peer_buffer2[idx] = val; \ + peer_buffer3[idx] = val; \ + } \ + } + +#define DEFINE_WRITE_MONOLITHIC_4_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void write_monolithic_kernel_4_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + __global Dtype* peer_buffer1, \ + __global Dtype* peer_buffer2, 
\ + __global Dtype* peer_buffer3, \ + __global Dtype* peer_buffer4) { \ + DEBUG_BLOCK(printf("in write_monolithic_kernel_4 count %d\n", count)); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + const Dtype val = input_buffer[idx]; \ + peer_buffer1[idx] = val; \ + peer_buffer2[idx] = val; \ + peer_buffer3[idx] = val; \ + peer_buffer4[idx] = val; \ + } \ + } + +#define DEFINE_WRITE_MONOLITHIC_5_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void write_monolithic_kernel_5_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + __global Dtype* peer_buffer1, \ + __global Dtype* peer_buffer2, \ + __global Dtype* peer_buffer3, \ + __global Dtype* peer_buffer4, \ + __global Dtype* peer_buffer5) { \ + DEBUG_BLOCK(printf("in write_monolithic_kernel_5 count %d\n", count)); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + const Dtype val = input_buffer[idx]; \ + peer_buffer1[idx] = val; \ + peer_buffer2[idx] = val; \ + peer_buffer3[idx] = val; \ + peer_buffer4[idx] = val; \ + peer_buffer5[idx] = val; \ + } \ + } + // Define kernels for a specific reduction operation for all supported datatypes #define DEFINE_KERNELS_WITH_OP(KernelName, OpName) \ DEFINE_##KernelName##_KERNEL(int8, char, OpName, __##OpName##_##char) \ @@ -138,3 +346,21 @@ DEFINE_FP16OPS(half) DEFINE_ALL_KERNELS(ALLREDUCE) DEFINE_ALL_KERNELS(REDUCE_LOCAL_OUTOFPLACE) DEFINE_ALL_KERNELS(REDUCE_LOCAL_INPLACE) +DEFINE_ALL_KERNELS(REDUCE_SINGLE_LOCAL_INPLACE) +DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_1) +DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_2) +DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_3) +DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_4) +DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_5) + +DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_1, custom) 
+DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_2, custom) +DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_3, custom) +DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_4, custom) +DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_5, custom) + +DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_1, custom) +DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_2, custom) +DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_3, custom) +DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_4, custom) +DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_5, custom) diff --git a/src/kernels/kernels.spv b/src/kernels/kernels.spv index 4144e53a1..ac4b8eecf 100644 Binary files a/src/kernels/kernels.spv and b/src/kernels/kernels.spv differ diff --git a/src/native_device_api/sycl/export.cpp b/src/native_device_api/sycl/export.cpp index 1ca28290d..18c39eebc 100644 --- a/src/native_device_api/sycl/export.cpp +++ b/src/native_device_api/sycl/export.cpp @@ -46,7 +46,7 @@ generic_context_type::get() const noexcept { */ generic_device_type::generic_device_type( device_index_type id, - cl::sycl::info::device_type type /* = info::device_type::gpu*/) + sycl::info::device_type type /* = info::device_type::gpu*/) : device() { if ((std::get<0>(id) == ccl::unused_index_value) && (std::get<1>(id) == ccl::unused_index_value) && @@ -57,41 +57,41 @@ generic_device_type::generic_device_type( LOG_DEBUG("Try to find SYCL device by index: ", id, ", type: ", - static_cast::type>(type)); + static_cast::type>(type)); - auto platforms = cl::sycl::platform::get_platforms(); + auto platforms = sycl::platform::get_platforms(); LOG_DEBUG("Found CL plalforms: ", platforms.size()); auto platform_it = - std::find_if(platforms.begin(), platforms.end(), [](const cl::sycl::platform& pl) { - return pl.get_info().find("Level-Zero") != + std::find_if(platforms.begin(), platforms.end(), [](const sycl::platform& pl) { + return pl.get_info().find("Level-Zero") != std::string::npos; - //or platform.get_backend() == cl::sycl::backend::ext_oneapi_level_zero + //or platform.get_backend() == 
sycl::backend::ext_oneapi_level_zero }); if (platform_it == platforms.end()) { std::stringstream ss; ss << "cannot find Level-Zero platform. Supported platforms are:\n"; for (const auto& pl : platforms) { - ss << "Platform:\nprofile: " << pl.get_info() - << "\nversion: " << pl.get_info() - << "\nname: " << pl.get_info() - << "\nvendor: " << pl.get_info(); + ss << "Platform:\nprofile: " << pl.get_info() + << "\nversion: " << pl.get_info() + << "\nname: " << pl.get_info() + << "\nvendor: " << pl.get_info(); } CCL_THROW("cannot find device by id: " + ccl::to_string(id) + ", reason:\n" + ss.str()); } LOG_DEBUG("Platform:\nprofile: ", - platform_it->get_info(), + platform_it->get_info(), "\nversion: ", - platform_it->get_info(), + platform_it->get_info(), "\nname: ", - platform_it->get_info(), + platform_it->get_info(), "\nvendor: ", - platform_it->get_info()); + platform_it->get_info()); } generic_device_type::generic_device_type( - const cl::sycl::device& in_device) + const sycl::device& in_device) : device(in_device) {} device_index_type generic_device_type::get_id() const { diff --git a/src/parallelizer/parallelizer.cpp b/src/parallelizer/parallelizer.cpp index cf6d10110..db84e4717 100644 --- a/src/parallelizer/parallelizer.cpp +++ b/src/parallelizer/parallelizer.cpp @@ -19,10 +19,13 @@ #include "coll/coll_util.hpp" #include "coll/selection/selection.hpp" #include "common/global/global.hpp" -#include "common/utils/sycl_utils.hpp" #include "parallelizer/parallelizer.hpp" #include "sched/entry/factory/entry_factory.hpp" +#ifdef CCL_ENABLE_SYCL +#include "common/utils/sycl_utils.hpp" +#endif // CCL_ENABLE_SYCL + #define CCL_ATL_LARGE_MSG_SIZE (1024 * 1024 * 1024) ccl::status ccl_parallelizer::process(ccl_sched* sched, bool update_sched_id) { @@ -465,6 +468,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i param.reduction = coll_param.reduction; param.comm = comm; param.stream = coll_param.stream; + param.is_scaleout = 
coll_param.is_scaleout; ccl::add_coll_entry(part_scheds[idx].get(), param); } break; diff --git a/src/sched/buffer/buffer_manager.hpp b/src/sched/buffer/buffer_manager.hpp index 5c34973a1..ed7874db1 100644 --- a/src/sched/buffer/buffer_manager.hpp +++ b/src/sched/buffer/buffer_manager.hpp @@ -16,7 +16,7 @@ #pragma once #ifdef CCL_ENABLE_ZE -#include "common/ze/ze_api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" #endif // CCL_ENABLE_ZE #ifdef CCL_ENABLE_SYCL #include diff --git a/src/sched/cache/key.cpp b/src/sched/cache/key.cpp index cab6eda15..1637cd42b 100644 --- a/src/sched/cache/key.cpp +++ b/src/sched/cache/key.cpp @@ -165,8 +165,8 @@ size_t ccl_sched_key_hasher::operator()(const ccl_sched_key& k) const { e.g. sum(a[idx]*(idx+1)) */ size_t vec1_sum = std::accumulate(k.vec1.begin(), k.vec1.end(), 0); size_t vec2_sum = std::accumulate(k.vec2.begin(), k.vec2.end(), 0); - hash_value += k.f.ctype + utils::enum_to_underlying(k.f.dtype) + - utils::enum_to_underlying(k.f.reduction) + k.f.count1 + k.f.count2 + + hash_value += k.f.ctype + ccl::utils::enum_to_underlying(k.f.dtype) + + ccl::utils::enum_to_underlying(k.f.reduction) + k.f.count1 + k.f.count2 + k.f.root + (size_t)k.f.buf1 + (size_t)k.f.buf2 + (size_t)k.f.comm + (size_t)k.f.reduction_fn + vec1_sum + vec2_sum; } diff --git a/src/sched/entry/coll/coll_entry.cpp b/src/sched/entry/coll/coll_entry.cpp index 82d0cf2e2..35fc307db 100644 --- a/src/sched/entry/coll/coll_entry.cpp +++ b/src/sched/entry/coll/coll_entry.cpp @@ -38,7 +38,8 @@ ccl::status coll_entry::build_sched(ccl_sched* sched, const ccl_coll_entry_param param.count, param.dtype, param.reduction, - param.comm); + param.comm, + param.is_scaleout); break; } case ccl_coll_alltoall: { diff --git a/src/sched/entry/coll/coll_entry_param.hpp b/src/sched/entry/coll/coll_entry_param.hpp index b0bf58ac8..05f0a1ac6 100644 --- a/src/sched/entry/coll/coll_entry_param.hpp +++ b/src/sched/entry/coll/coll_entry_param.hpp @@ -23,6 +23,8 @@ struct 
ccl_coll_entry_param { ccl_buffer recv_buf{}; size_t count{}; size_t send_count{}; + std::vector send_bufs; + std::vector recv_bufs; const size_t* send_counts{}; const size_t* recv_counts{}; ccl_datatype dtype{}; @@ -31,4 +33,5 @@ struct ccl_coll_entry_param { ccl_comm* comm{}; ccl_stream* stream{}; ccl_coll_algo hint_algo{}; + bool is_scaleout{ false }; }; diff --git a/src/sched/entry/coll/direct/allgatherv_entry.hpp b/src/sched/entry/coll/direct/allgatherv_entry.hpp index 03416b893..c1bea8c6e 100644 --- a/src/sched/entry/coll/direct/allgatherv_entry.hpp +++ b/src/sched/entry/coll/direct/allgatherv_entry.hpp @@ -26,39 +26,34 @@ class allgatherv_entry : public base_coll_entry { allgatherv_entry() = delete; allgatherv_entry(ccl_sched* sched, const ccl_buffer send_buf, - size_t send_cnt, + size_t send_count, ccl_buffer recv_buf, - const size_t* recv_cnts, + const size_t* recv_counts, const ccl_datatype& dtype, ccl_comm* comm) : base_coll_entry(sched), send_buf(send_buf), - send_cnt(send_cnt), + send_count(send_count), recv_buf(recv_buf), - recv_cnts(recv_cnts), + recv_counts(recv_counts, recv_counts + comm->size()), dtype(dtype), comm(comm), - recv_bytes(nullptr), - offsets(nullptr), + recv_bytes(comm->size()), + offsets(comm->size()), sum_recv_bytes(0) {} void start() override { size_t dt_size = dtype.size(); - size_t send_bytes = send_cnt * dt_size; + size_t send_bytes = send_count * dt_size; int comm_size = comm->size(); int i; - if (!recv_bytes && !offsets) { - recv_bytes = static_cast(CCL_MALLOC(comm_size * sizeof(int), "recv_bytes")); - offsets = static_cast(CCL_MALLOC(comm_size * sizeof(int), "offsets")); - } - - recv_bytes[0] = recv_cnts[0] * dt_size; + recv_bytes[0] = recv_counts[0] * dt_size; offsets[0] = 0; sum_recv_bytes = recv_bytes[0]; for (i = 1; i < comm_size; i++) { - recv_bytes[i] = recv_cnts[i] * dt_size; + recv_bytes[i] = recv_counts[i] * dt_size; offsets[i] = offsets[i - 1] + recv_bytes[i - 1]; // treat buffers as char buffers sum_recv_bytes += 
recv_bytes[i]; } @@ -68,8 +63,8 @@ class allgatherv_entry : public base_coll_entry { send_buf.get_ptr(send_bytes), send_bytes, recv_buf.get_ptr(sum_recv_bytes), - recv_bytes, - offsets, + recv_bytes.data(), + offsets.data(), req); if (unlikely(atl_status != ATL_STATUS_SUCCESS)) { @@ -91,11 +86,6 @@ class allgatherv_entry : public base_coll_entry { } } - ~allgatherv_entry() { - CCL_FREE(recv_bytes); - CCL_FREE(offsets); - } - const char* name() const override { return class_name(); } @@ -105,18 +95,18 @@ class allgatherv_entry : public base_coll_entry { ccl_logger::format(str, "dt ", ccl::global_data::get().dtypes->name(dtype), - ", send_cnt ", - send_cnt, + ", send_count ", + send_count, ", send_buf ", send_buf, - ", recv_cnt ", - recv_cnts, + ", recv_counts[0] ", + recv_counts[0], ", recv_buf ", recv_buf, - ", recv_bytes ", - recv_bytes, - ", offsets ", - offsets, + ", recv_bytes[0] ", + recv_bytes[0], + ", offsets[0] ", + offsets[0], ", comm_id ", comm->get_comm_id(), ", req ", @@ -125,15 +115,15 @@ class allgatherv_entry : public base_coll_entry { } private: - ccl_buffer send_buf; - size_t send_cnt; - ccl_buffer recv_buf; - const size_t* recv_cnts; - ccl_datatype dtype; - ccl_comm* comm; + const ccl_buffer send_buf; + const size_t send_count; + const ccl_buffer recv_buf; + const std::vector recv_counts; + const ccl_datatype dtype; + const ccl_comm* comm; atl_req_t req{}; - int* recv_bytes; - int* offsets; + std::vector recv_bytes; + std::vector offsets; size_t sum_recv_bytes; }; diff --git a/src/sched/entry/coll/direct/alltoallv_entry.hpp b/src/sched/entry/coll/direct/alltoallv_entry.hpp index f95376ecf..a45cd47cb 100644 --- a/src/sched/entry/coll/direct/alltoallv_entry.hpp +++ b/src/sched/entry/coll/direct/alltoallv_entry.hpp @@ -27,22 +27,22 @@ class alltoallv_entry : public base_coll_entry { alltoallv_entry() = delete; alltoallv_entry(ccl_sched* sched, const ccl_buffer send_buf, - const size_t* send_cnts, + const size_t* send_counts, ccl_buffer recv_buf, - 
const size_t* recv_cnts, + const size_t* recv_counts, const ccl_datatype& dtype, ccl_comm* comm) : base_coll_entry(sched), send_buf(send_buf), - send_cnts(send_cnts), + send_counts(send_counts, send_counts + comm->size()), recv_buf(recv_buf), - recv_cnts(recv_cnts), + recv_counts(recv_counts, recv_counts + comm->size()), dtype(dtype), comm(comm), - send_bytes(nullptr), - recv_bytes(nullptr), - send_offsets(nullptr), - recv_offsets(nullptr), + send_bytes(comm->size()), + recv_bytes(comm->size()), + send_offsets(comm->size()), + recv_offsets(comm->size()), sum_send_bytes(0), sum_recv_bytes(0) {} @@ -53,23 +53,16 @@ class alltoallv_entry : public base_coll_entry { sum_recv_bytes = 0; sum_send_bytes = 0; - if (!send_bytes && !recv_bytes && !send_offsets && !recv_offsets) { - send_bytes = static_cast(CCL_MALLOC(comm_size * sizeof(int), "send_bytes")); - recv_bytes = static_cast(CCL_MALLOC(comm_size * sizeof(int), "recv_bytes")); - send_offsets = static_cast(CCL_MALLOC(comm_size * sizeof(int), "send_offsets")); - recv_offsets = static_cast(CCL_MALLOC(comm_size * sizeof(int), "recv_offsets")); - } - - send_bytes[0] = send_cnts[0] * dt_size; - recv_bytes[0] = recv_cnts[0] * dt_size; + send_bytes[0] = send_counts[0] * dt_size; + recv_bytes[0] = recv_counts[0] * dt_size; send_offsets[0] = 0; recv_offsets[0] = 0; sum_send_bytes = send_bytes[0]; sum_recv_bytes = recv_bytes[0]; for (i = 1; i < comm_size; i++) { - send_bytes[i] = send_cnts[i] * dt_size; - recv_bytes[i] = recv_cnts[i] * dt_size; + send_bytes[i] = send_counts[i] * dt_size; + recv_bytes[i] = recv_counts[i] * dt_size; send_offsets[i] = send_offsets[i - 1] + send_bytes[i - 1]; // treat buffers as char buffers recv_offsets[i] = recv_offsets[i - 1] + recv_bytes[i - 1]; @@ -81,11 +74,11 @@ class alltoallv_entry : public base_coll_entry { atl_status_t atl_status = comm->get_atl_comm()->alltoallv(sched->bin->get_atl_ep(), send_buf.get_ptr(sum_send_bytes), - send_bytes, - send_offsets, + send_bytes.data(), + 
send_offsets.data(), recv_buf.get_ptr(sum_recv_bytes), - recv_bytes, - recv_offsets, + recv_bytes.data(), + recv_offsets.data(), req); if (unlikely(atl_status != ATL_STATUS_SUCCESS)) { @@ -107,13 +100,6 @@ class alltoallv_entry : public base_coll_entry { } } - ~alltoallv_entry() { - CCL_FREE(send_bytes); - CCL_FREE(recv_bytes); - CCL_FREE(send_offsets); - CCL_FREE(recv_offsets); - } - const char* name() const override { return class_name(); } @@ -123,22 +109,22 @@ class alltoallv_entry : public base_coll_entry { ccl_logger::format(str, "dt ", ccl::global_data::get().dtypes->name(dtype), - ", send_cnts ", - send_cnts, + ", send_counts[0] ", + send_counts[0], ", send_buf ", send_buf, - ", send_bytes ", - send_bytes, - ", send_offsets ", - send_offsets, - ", recv_cnts ", - recv_cnts, + ", send_bytes[0] ", + send_bytes[0], + ", send_offsets[0] ", + send_offsets[0], + ", recv_counts[0] ", + recv_counts[0], ", recv_buf ", recv_buf, - ", recv_bytes ", - recv_bytes, - ", recv_offsets ", - recv_offsets, + ", recv_bytes[0] ", + recv_bytes[0], + ", recv_offsets[0] ", + recv_offsets[0], ", comm_id ", comm->get_comm_id(), ", req ", @@ -147,18 +133,18 @@ class alltoallv_entry : public base_coll_entry { } private: - ccl_buffer send_buf; - const size_t* send_cnts; - ccl_buffer recv_buf; - const size_t* recv_cnts; - ccl_datatype dtype; - ccl_comm* comm; + const ccl_buffer send_buf; + const std::vector send_counts; + const ccl_buffer recv_buf; + const std::vector recv_counts; + const ccl_datatype dtype; + const ccl_comm* comm; atl_req_t req{}; - int* send_bytes; - int* recv_bytes; - int* send_offsets; - int* recv_offsets; + std::vector send_bytes; + std::vector recv_bytes; + std::vector send_offsets; + std::vector recv_offsets; size_t sum_send_bytes; size_t sum_recv_bytes; }; diff --git a/src/sched/entry/copy/copy_entry.cpp b/src/sched/entry/copy/copy_entry.cpp index afb42ee40..c77bf9508 100644 --- a/src/sched/entry/copy/copy_entry.cpp +++ b/src/sched/entry/copy/copy_entry.cpp @@ 
-13,14 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "common/global/global.hpp" #include "sched/entry/copy/copy_entry.hpp" #include "sched/queue/queue.hpp" #ifdef CCL_ENABLE_SYCL -#include -#include #include "common/utils/sycl_utils.hpp" - #ifdef CCL_ENABLE_ZE #include "sched/entry/ze/ze_copy_entry.hpp" #endif // CCL_ENABLE_ZE @@ -181,6 +179,25 @@ void copy_entry::reset(size_t idx) { #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE } +void copy_entry::dump_detail(std::stringstream& str) const { + ccl_logger::format(str, + "dt ", + ccl::global_data::get().dtypes->name(dtype), + ", count ", + count, + ", in_buf ", + in_buf, + ", out_buf ", + out_buf, + ", in_buf_offset ", + attr.in_buf_offset, + ", out_buf_offset ", + attr.out_buf_offset, + ", direction ", + to_string(attr.direction), + "\n"); +} + void copy_entry::do_regular_copy() { size_t bytes = dtype.size() * count; auto comp_status = diff --git a/src/sched/entry/copy/copy_entry.hpp b/src/sched/entry/copy/copy_entry.hpp index 1dcfb9844..cd518a269 100644 --- a/src/sched/entry/copy/copy_entry.hpp +++ b/src/sched/entry/copy/copy_entry.hpp @@ -47,24 +47,7 @@ class copy_entry : public sched_entry { void reset(size_t idx) override; protected: - void dump_detail(std::stringstream& str) const override { - ccl_logger::format(str, - "dt ", - ccl::global_data::get().dtypes->name(dtype), - ", count ", - count, - ", in_buf ", - in_buf, - ", out_buf ", - out_buf, - ", in_buf_offset ", - attr.in_buf_offset, - ", out_buf_offset ", - attr.out_buf_offset, - ", direction ", - to_string(attr.direction), - "\n"); - } + void dump_detail(std::stringstream& str) const override; private: ccl_buffer in_buf{}; diff --git a/src/sched/entry/copy/copy_helper.cpp b/src/sched/entry/copy/copy_helper.cpp index 8db75901f..cfe1814c2 100644 --- a/src/sched/entry/copy/copy_helper.cpp +++ b/src/sched/entry/copy/copy_helper.cpp @@ -13,8 +13,24 @@ See the License for the specific 
language governing permissions and limitations under the License. */ +#include "common/global/global.hpp" #include "sched/entry/copy/copy_helper.hpp" +copy_attr::copy_attr() + : peer_rank(ccl_comm::invalid_rank), + peer_buf_idx(0), + direction(copy_direction::undefined), + map_comm(nullptr), + in_buf_offset(0), + out_buf_offset(0), + use_nontemporal(false) +#ifdef CCL_ENABLE_ZE + , + hint_queue_index(0) +#endif // CCL_ENABLE_ZE +{ +} + copy_attr::copy_attr(int peer_rank, size_t peer_buf_idx, copy_direction direction, @@ -34,8 +50,43 @@ copy_attr::copy_attr(copy_direction direction, size_t in_buf_offset, size_t out_ out_buf_offset(out_buf_offset) {} using copy_direction_str_enum = - utils::enum_to_str; + ccl::utils::enum_to_str; std::string to_string(copy_direction val) { - return copy_direction_str_enum({ "UNDEFINED", "H2H", "D2H", "H2D", "D2D" }) + return copy_direction_str_enum({ "UNDEFINED", "H2H", "D2H", "H2D", "D2D", "T2T", "C2C" }) .choose(val, "UNKNOWN"); } + +#ifdef CCL_ENABLE_SYCL + +sycl_copier::sycl_copier(copy_direction direction, + ccl_buffer in_buf, + ccl_buffer out_buf, + size_t count, + const ccl_datatype& dtype, + bool is_sycl_buf, + size_t in_buf_offset, + size_t out_buf_offset) + : direction(direction), + in_buf(in_buf), + out_buf(out_buf), + count(count), + dtype(dtype), + is_sycl_buf(is_sycl_buf), + in_buf_offset(in_buf_offset), + out_buf_offset(out_buf_offset) {} + +bool sycl_copier::is_completed() const { + return e.get_info() == + sycl::info::event_command_status::complete; +} + +void sycl_copier::set_queue(const sycl::queue* external_q) { + q = const_cast(external_q); + CCL_THROW_IF_NOT(q); +} + +std::string sycl_copier::get_dtype_name(const ccl_datatype& dt) const { + return ccl::global_data::get().dtypes->name(dt); +} + +#endif // CCL_ENABLE_SYCL diff --git a/src/sched/entry/copy/copy_helper.hpp b/src/sched/entry/copy/copy_helper.hpp index 3066358d5..4363d5c23 100644 --- a/src/sched/entry/copy/copy_helper.hpp +++ 
b/src/sched/entry/copy/copy_helper.hpp @@ -16,43 +16,43 @@ #pragma once #include "common/datatype/datatype.hpp" -#include "common/global/global.hpp" #include "common/utils/buffer.hpp" #include "common/utils/enums.hpp" #include "common/utils/tuple.hpp" + +#ifdef CCL_ENABLE_SYCL #include "common/utils/sycl_utils.hpp" +#endif // CCL_ENABLE_SYCL -enum class copy_direction { undefined, h2h, d2h, h2d, d2d }; +enum class copy_direction { undefined, h2h, d2h, h2d, d2d, t2t, c2c }; std::string to_string(copy_direction val); +class ccl_comm; + struct copy_attr { - int peer_rank = ccl_comm::invalid_rank; - size_t peer_buf_idx = 0; - copy_direction direction = copy_direction::undefined; - ccl_comm* map_comm = nullptr; - size_t in_buf_offset = 0; - size_t out_buf_offset = 0; - bool use_nontemporal = false; + int peer_rank; + size_t peer_buf_idx; + copy_direction direction; + ccl_comm* map_comm; + size_t in_buf_offset; + size_t out_buf_offset; + bool use_nontemporal; #ifdef CCL_ENABLE_ZE - int hint_queue_index = 0; - bool is_peer_card_copy = false; + int hint_queue_index; #endif // CCL_ENABLE_ZE - copy_attr() {} - + copy_attr(); copy_attr(int peer_rank, size_t peer_buf_idx, copy_direction direction, ccl_comm* map_comm = nullptr, size_t in_buf_offset = 0, size_t out_buf_offset = 0); - copy_attr(copy_direction direction, size_t in_buf_offset = 0, size_t out_buf_offset = 0); }; #ifdef CCL_ENABLE_SYCL - struct sycl_copier { sycl_copier() = default; sycl_copier(copy_direction direction, @@ -62,35 +62,20 @@ struct sycl_copier { const ccl_datatype& dtype, bool is_sycl_buf = false, size_t in_buf_offset = 0, - size_t out_buf_offset = 0) - : direction(direction), - in_buf(in_buf), - out_buf(out_buf), - count(count), - dtype(dtype), - is_sycl_buf(is_sycl_buf), - in_buf_offset(in_buf_offset), - out_buf_offset(out_buf_offset) {} - - bool is_completed() { - return (e.get_info() == - sycl::info::event_command_status::complete) - ? 
true - : false; - } + size_t out_buf_offset = 0); - void set_queue(sycl::queue* external_q) { - q = external_q; - CCL_THROW_IF_NOT(q); - } + bool is_completed() const; + void set_queue(const sycl::queue* external_q); template void invoke() { - if (index == (int)(dtype.idx())) { + const bool dtype_idx_is_matched = index == (int)(dtype.idx()); + + if (dtype_idx_is_matched) { LOG_DEBUG("visitor matched index: ", index, - ", ccl: ", - ccl::global_data::get().dtypes->name(dtype), + ", dt: ", + get_dtype_name(dtype), ", in: ", __PRETTY_FUNCTION__); @@ -179,13 +164,15 @@ struct sycl_copier { else { LOG_TRACE("visitor skipped index: ", index, - ", ccl: ", - ccl::global_data::get().dtypes->name(dtype), + ", dt: ", + get_dtype_name(dtype), ", in: ", __PRETTY_FUNCTION__); } } + std::string get_dtype_name(const ccl_datatype& dt) const; + copy_direction direction; ccl_buffer in_buf; ccl_buffer out_buf; @@ -197,5 +184,4 @@ struct sycl_copier { size_t out_buf_offset; sycl::event e; }; - #endif // CCL_ENABLE_SYCL diff --git a/src/sched/entry/deps_entry.hpp b/src/sched/entry/deps_entry.hpp index a7b4ef41e..0c8a8c4dd 100644 --- a/src/sched/entry/deps_entry.hpp +++ b/src/sched/entry/deps_entry.hpp @@ -1,6 +1,9 @@ #pragma once +#ifdef CCL_ENABLE_SYCL #include "common/utils/sycl_utils.hpp" +#endif // CCL_ENABLE_SYCL + #include "sched/entry/entry.hpp" class deps_entry : public sched_entry { diff --git a/src/sched/entry/entry.cpp b/src/sched/entry/entry.cpp index 12c7a3088..a44d587c0 100644 --- a/src/sched/entry/entry.cpp +++ b/src/sched/entry/entry.cpp @@ -18,6 +18,13 @@ #include "sched/entry/entry.hpp" #include "sched/sched.hpp" +sched_entry::sched_entry(ccl_sched* sched, bool is_barrier) : sched(sched), barrier(is_barrier) { + use_total_timer = ccl::global_data::env().sched_profile; + detect_update_time_expiration = + ccl::global_data::env().entry_max_update_time_sec != CCL_ENV_SIZET_NOT_SPECIFIED; + use_update_timer = ccl::global_data::env().sched_profile || 
detect_update_time_expiration; +} + void sched_entry::do_progress() { if (is_completed()) return; @@ -34,8 +41,8 @@ void sched_entry::do_progress() { bool took_credits = false; if (status == ccl_sched_entry_status_not_started) { took_credits = sched->flow_control.take_credit(); - if (took_credits && ccl::global_data::env().sched_profile) { - timer.start(); + if (took_credits && use_total_timer) { + total_timer.start(); } } else if (status == ccl_sched_entry_status_again) { @@ -56,7 +63,31 @@ void sched_entry::do_progress() { } else if (status == ccl_sched_entry_status_started) { LOG_TRACE("update entry ", name()); + + if (use_update_timer && !update_timer.is_started()) { + update_timer.start(); + } + else if (update_timer.is_started() && detect_update_time_expiration) { + // do this before entry::update so entry can handle this state inside update + long double seconds = update_timer.get_elapsed_usec() / 1000000; + if (seconds >= ccl::global_data::env().entry_max_update_time_sec) { + is_update_time_expired = true; + } + } + update(); + + if (use_update_timer) { + update_timer.update(); + } + + // ignore timeout on coll entry + // actual timeout will be reported from sub-entries + if (strcmp(name(), "COLL") != 0) { + CCL_THROW_IF_NOT( + !is_update_time_expired, "entry ", name(), " ", this, " update time expired"); + } + CCL_THROW_IF_NOT(status >= ccl_sched_entry_status_started, "bad status ", status, @@ -66,8 +97,8 @@ void sched_entry::do_progress() { } if (status == ccl_sched_entry_status_complete) { - if (ccl::global_data::env().sched_profile) { - timer.stop(); + if (use_total_timer) { + total_timer.update(); } if (exec_mode == ccl_sched_entry_exec_once) { @@ -99,8 +130,13 @@ void sched_entry::update() { } void sched_entry::reset(size_t idx) { - if (ccl::global_data::env().sched_profile) { - timer.reset(); + if (use_total_timer) { + total_timer.reset(); + } + + if (use_update_timer) { + update_timer.reset(); + is_update_time_expired = false; } if (status == 
ccl_sched_entry_status_complete_once) { @@ -160,6 +196,10 @@ void sched_entry::set_exec_mode(ccl_sched_entry_exec_mode mode) { exec_mode = mode; } +std::string sched_entry::name_ext() const { + return std::string(name()); +} + void sched_entry::dump_detail(std::stringstream& str) const {} void sched_entry::update_status(atl_status_t atl_status) { diff --git a/src/sched/entry/entry.hpp b/src/sched/entry/entry.hpp index 9d3d4f2e2..19e7df0c7 100644 --- a/src/sched/entry/entry.hpp +++ b/src/sched/entry/entry.hpp @@ -52,9 +52,7 @@ enum ccl_condition { class alignas(CACHELINE_SIZE) sched_entry { public: sched_entry() = delete; - explicit sched_entry(ccl_sched* sched, bool is_barrier = false) - : sched(sched), - barrier(is_barrier) {} + explicit sched_entry(ccl_sched* sched, bool is_barrier = false); virtual ~sched_entry() {} @@ -74,10 +72,12 @@ class alignas(CACHELINE_SIZE) sched_entry { void set_exec_mode(ccl_sched_entry_exec_mode mode); virtual const char* name() const = 0; + virtual std::string name_ext() const; static const char* status_to_str(ccl_sched_entry_status status); - ccl::sched_timer timer; + ccl::sched_timer total_timer; + ccl::sched_timer update_timer; virtual void init(){}; virtual void finalize(){}; @@ -95,4 +95,9 @@ class alignas(CACHELINE_SIZE) sched_entry { size_t start_idx = 0; ccl_sched_entry_status status = ccl_sched_entry_status_not_started; ccl_sched_entry_exec_mode exec_mode = ccl_sched_entry_exec_regular; + + bool use_total_timer = false; + bool detect_update_time_expiration = false; + bool use_update_timer = false; + bool is_update_time_expired = false; }; diff --git a/src/sched/entry/probe_entry.hpp b/src/sched/entry/probe_entry.hpp index adc1d4ece..e57127ed1 100644 --- a/src/sched/entry/probe_entry.hpp +++ b/src/sched/entry/probe_entry.hpp @@ -33,7 +33,7 @@ class probe_entry : public sched_entry { comm(comm) {} void start() override { - atl_tag = comm->get_atl_comm()->tag->create( + atl_tag = comm->get_atl_comm()->tag_creator->create( src, 
comm->get_comm_id(), sched->sched_id, sched->get_op_id()); LOG_DEBUG("PROBE entry src ", src, ", tag ", atl_tag); status = ccl_sched_entry_status_started; diff --git a/src/sched/entry/recv_copy_entry.cpp b/src/sched/entry/recv_copy_entry.cpp index bd4d9fdaa..52fb8cfba 100644 --- a/src/sched/entry/recv_copy_entry.cpp +++ b/src/sched/entry/recv_copy_entry.cpp @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "comm/comm.hpp" #include "comp/comp.hpp" #include "sched/entry/entry.hpp" #include "sched/entry/recv_copy_entry.hpp" #include "sched/queue/queue.hpp" void recv_copy_entry::start() { - atl_tag = comm->get_atl_comm()->tag->create( + atl_tag = comm->get_atl_comm()->tag_creator->create( src, comm->get_comm_id(), sched->sched_id, sched->get_op_id()); LOG_DEBUG("starting RECV in RECV_COPY entry, src ", src, @@ -56,3 +57,22 @@ void recv_copy_entry::update() { status = ccl_sched_entry_status_complete; LOG_DEBUG("completed COPY in RECV_COPY entry"); } + +void recv_copy_entry::dump_detail(std::stringstream& str) const { + ccl_logger::format(str, + ", recv_buf ", + recv_buf, + ", copy_buf ", + copy_buf, + ", bytes ", + bytes, + ", src ", + src, + ", atl_tag ", + atl_tag, + ", comm_id ", + comm->get_comm_id(), + ", req ", + req, + "\n"); +} diff --git a/src/sched/entry/recv_copy_entry.hpp b/src/sched/entry/recv_copy_entry.hpp index 8adcd7021..d31c75af8 100644 --- a/src/sched/entry/recv_copy_entry.hpp +++ b/src/sched/entry/recv_copy_entry.hpp @@ -48,24 +48,7 @@ class recv_copy_entry final : public sched_entry { } protected: - void dump_detail(std::stringstream& str) const override { - ccl_logger::format(str, - ", recv_buf ", - recv_buf, - ", copy_buf ", - copy_buf, - ", bytes ", - bytes, - ", src ", - src, - ", atl_tag ", - atl_tag, - ", comm_id ", - comm->get_comm_id(), - ", req ", - req, - "\n"); - } + void dump_detail(std::stringstream& str) const override; private: ccl_buffer recv_buf; diff 
--git a/src/sched/entry/recv_entry.hpp b/src/sched/entry/recv_entry.hpp index 0f48f5157..5eca5c28c 100644 --- a/src/sched/entry/recv_entry.hpp +++ b/src/sched/entry/recv_entry.hpp @@ -53,7 +53,7 @@ class recv_entry : public sched_entry, void start() override { update_fields(); - atl_tag = comm->get_atl_comm()->tag->create( + atl_tag = comm->get_atl_comm()->tag_creator->create( src, comm->get_comm_id(), sched->sched_id, sched->get_op_id()); size_t bytes = cnt * dtype.size(); diff --git a/src/sched/entry/recv_reduce_entry.hpp b/src/sched/entry/recv_reduce_entry.hpp index 6195d9dda..828b39baf 100644 --- a/src/sched/entry/recv_reduce_entry.hpp +++ b/src/sched/entry/recv_reduce_entry.hpp @@ -77,7 +77,7 @@ class recv_reduce_entry final : public sched_entry { } void start() override { - atl_tag = comm->get_atl_comm()->tag->create( + atl_tag = comm->get_atl_comm()->tag_creator->create( src, comm->get_comm_id(), sched->sched_id, sched->get_op_id()); size_t bytes = in_cnt * dtype.size(); LOG_DEBUG("starting RECV in RECV_REDUCE entry, src ", diff --git a/src/sched/entry/send_entry.hpp b/src/sched/entry/send_entry.hpp index 4b44cfeb2..44c1e5127 100644 --- a/src/sched/entry/send_entry.hpp +++ b/src/sched/entry/send_entry.hpp @@ -71,7 +71,7 @@ class send_entry : public sched_entry, } void start_send() { - atl_tag = comm->get_atl_comm()->tag->create( + atl_tag = comm->get_atl_comm()->tag_creator->create( comm->rank(), comm->get_comm_id(), sched->sched_id, sched->get_op_id()); size_t bytes = cnt * dtype.size(); diff --git a/src/sched/entry/subsched_entry.hpp b/src/sched/entry/subsched_entry.hpp index 381b8911d..adaa38e52 100644 --- a/src/sched/entry/subsched_entry.hpp +++ b/src/sched/entry/subsched_entry.hpp @@ -59,6 +59,11 @@ class subsched_entry : public sched_entry { auto& subscheds = subsched->get_subscheds(); for (size_t i = 0; i < subscheds.size(); i++) { subscheds[i]->set_op_id(i); + if (!strcmp(subsched_name, "SCALEOUT") || !strcmp(subsched_name, "A2AV_RECV") || + 
!strcmp(subsched_name, "A2AV_SEND")) { + subscheds[i]->set_scaleout_flag(); + LOG_DEBUG("scaleout flag set: ", subscheds[i]); + } } } diff --git a/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp index 18abf70c9..240d4d2a7 100644 --- a/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp +++ b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "common/stream/stream.hpp" +#include "comp/comp.hpp" #include "sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp" #include "sched/entry/ze/ze_a2a_allgatherv_entry.hpp" #include "sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp" @@ -23,6 +24,7 @@ #include #include +#include using namespace ccl; using namespace ccl::ze; @@ -36,7 +38,8 @@ ze_a2a_allreduce_entry::ze_a2a_allreduce_entry(ccl_sched* sched, ccl_comm* comm, std::vector wait_events, size_t send_buf_idx, - size_t recv_buf_idx) + size_t recv_buf_idx, + size_t peer_buf_offset) : ze_base_entry(sched, comm, comm->size() * event_group_count, wait_events), send_buf(send_buf), recv_buf(recv_buf), @@ -45,6 +48,7 @@ ze_a2a_allreduce_entry::ze_a2a_allreduce_entry(ccl_sched* sched, op(op), send_buf_idx(send_buf_idx), recv_buf_idx(recv_buf_idx), + peer_buf_offset(peer_buf_offset), peer_count(comm->size() - 1) { size_t segment_count = cnt / comm->size(); bool count_check = @@ -59,14 +63,16 @@ ze_a2a_allreduce_entry::ze_a2a_allreduce_entry(ccl_sched* sched, void ze_a2a_allreduce_entry::init_ze_hook() { /* get peer buffers */ std::vector peer_send_bufs(peer_count); - std::vector peer_recv_bufs(peer_count); + // allgatherv entry requires the peer_recv_bufs at the same index as rank + std::vector peer_recv_bufs(comm->size()); for (int i = 0; i < peer_count; ++i) { int peer_rank = (comm_rank + i + 1) % comm->size(); sched->get_memory().handle_manager.get(peer_rank, send_buf_idx, peer_send_bufs[i], comm); CCL_THROW_IF_NOT(peer_send_bufs[i].get_ptr(), "null 
IPC buffer is received"); - sched->get_memory().handle_manager.get(peer_rank, recv_buf_idx, peer_recv_bufs[i], comm); - CCL_THROW_IF_NOT(peer_recv_bufs[i].get_ptr(), "null IPC buffer is received"); + sched->get_memory().handle_manager.get( + peer_rank, recv_buf_idx, peer_recv_bufs[peer_rank], comm); + CCL_THROW_IF_NOT(peer_recv_bufs[peer_rank].get_ptr(), "null IPC buffer is received"); } size_t main_block_count = cnt / comm_size; @@ -84,10 +90,10 @@ void ze_a2a_allreduce_entry::init_ze_hook() { /* alloc temp buffer */ size_t tmp_buf_bytes = peer_count * block_count * dtype.size(); ccl::alloc_param alloc_param(tmp_buf_bytes, buffer_type::ze, buffer_place::device); - void* tmp_buf = sched->alloc_buffer(alloc_param).get_ptr(); + ccl_buffer tmp_buf = sched->alloc_buffer(alloc_param); LOG_DEBUG("rank ", - comm_size, + comm_rank, ", main_block_count: ", main_block_count, ", block_count: ", @@ -98,29 +104,42 @@ void ze_a2a_allreduce_entry::init_ze_hook() { cnt); /* copy peer segments to temp buffer */ - size_t main_block_bytes = main_block_count * dtype.size(); - size_t block_bytes = block_count * dtype.size(); - pre_copy_events.resize(peer_count); - for (auto& event : pre_copy_events) { - event = ze_base_entry::create_event(); + // do no need separate memcpys when using monolithic kernel + if (!ccl::global_data::env().reduce_scatter_monolithic_kernel) { + pre_copy_events.resize(peer_count); + for (auto& event : pre_copy_events) { + event = ze_base_entry::create_event(); + } + } + + if (ccl::global_data::env().reduce_scatter_monolithic_kernel) { + // two kernels. 
one leftover kernel and an aligned kernel + kernel_events.resize((int)ccl::utils::align_kernels::count); + } + else if (ccl::global_data::env().enable_kernel_single_reduce_peers) { + // when kernel merge is used only one kernel is required + kernel_events.resize(1); + } + else { + kernel_events.resize(peer_count); } - kernel_events.resize(peer_count); for (auto& event : kernel_events) { event = ze_base_entry::create_event(); } barrier_event = ze_base_entry::create_event(); - + bool is_monolithic = ccl::global_data::env().reduce_scatter_monolithic_kernel; + bool is_single_kernel = ccl::global_data::env().enable_kernel_single_reduce_peers; ze_a2a_reduce_scatter_entry::fill_list(this, send_buf.get_ptr(), - tmp_buf, + tmp_buf.get_ptr(), peer_send_bufs, peer_count, comm_rank, block_count, - comm_rank * main_block_bytes, + comm_rank * main_block_count, pre_copy_events, kernels, kernel_events, @@ -130,24 +149,68 @@ void ze_a2a_allreduce_entry::init_ze_hook() { device, context, op, - worker_idx); + worker_idx, + peer_buf_offset, + is_monolithic, + is_single_kernel); + + CCL_THROW_IF_NOT(!ccl::global_data::env().allgatherv_topo_read, + "ze_a2a_allreduce_entry with allgatherv_read not implemented for scaleup"); + // TODO: for doing allgatherv_read, we need to copy the reduced part from + // tmp_buf to recv_bufs[comm_rank] and use in_place allgatherv because + // we do not have the remote address of tmp_buf. Else use ipc exchange for tmp_buf. + // also we need to do a comm_barrier before allgatherv entry to make sure + // all remote ranks have finished reduce_scatter + + // for write, we can directly use tmp_buf and do not need in_place as true. 
- post_copy_events.resize(comm_size); + bool is_monolithic_allgat = ccl::global_data::env().allgatherv_monolithic_kernel; + // TODO: MLSL-1651 make int8 work with allgatherv write monolithic kernel + if (dtype == ccl::datatype::int8) { + is_monolithic_allgat = false; + } + if (is_monolithic_allgat) { + // two for peer copy (unaligned and aligned kernel) and one for non-inplace tmp_buf copy + post_copy_events.resize((int)ccl::utils::align_kernels::count + 1); + } + else { + post_copy_events.resize(comm_size); + } for (auto& event : post_copy_events) { event = ze_base_entry::create_event(); } + size_t main_block_bytes = main_block_count * dtype.size(); + std::vector block_bytes(comm_size, main_block_bytes); + // last rank chunk may have a different size due to leftover data + block_bytes.back() += (cnt - main_block_count * comm_size) * dtype.size(); + + std::vector rank_buf_offsets(comm_size); + rank_buf_offsets.at(comm_rank) = comm_rank * main_block_count; + std::vector recv_bufs; + for (int i = 0; i < comm_size; i++) { + recv_bufs.push_back(recv_buf + i * main_block_bytes); + } ze_a2a_allgatherv_entry::fill_list(this, comm_rank, tmp_buf, - recv_buf.get_ptr(), + recv_bufs, peer_recv_bufs, peer_count, block_bytes, - comm_rank * main_block_bytes, + dtype, + rank_buf_offsets, false, post_copy_events, - kernel_events.back()); + kernel_events, + kernels, + module, + device, + context, + worker_idx, + peer_buf_offset, + ccl::global_data::env().allgatherv_topo_read, + is_monolithic_allgat); } void ze_a2a_allreduce_entry::start() { @@ -170,3 +233,28 @@ void ze_a2a_allreduce_entry::update() { ZE_CALL(zeEventHostSignal, (ze_base_entry::entry_event)); ze_base_entry::update(); } + +std::string ze_a2a_allreduce_entry::name_ext() const { + std::stringstream out; + out << name() << ":" << cnt * dtype.size(); + return out.str(); +} + +void ze_a2a_allreduce_entry::dump_detail(std::stringstream& str) const { + ccl_logger::format(str, + "dt ", + 
ccl::global_data::get().dtypes->name(dtype), + ", cnt ", + cnt, + ", send_buf ", + send_buf, + ", recv_buf ", + recv_buf, + ", op ", + ccl_reduction_to_str(op), + ", comm ", + comm->to_string(), + ", context ", + context, + "\n"); +} diff --git a/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp index a36b9ec12..692054462 100644 --- a/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp +++ b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp @@ -16,12 +16,8 @@ #pragma once #include "common/utils/buffer.hpp" -#include "comp/comp.hpp" #include "sched/entry/ze/ze_base_entry.hpp" -#include -#include - class ze_a2a_allreduce_entry : public ze_base_entry { public: static constexpr const char* class_name() noexcept { @@ -32,12 +28,7 @@ class ze_a2a_allreduce_entry : public ze_base_entry { return class_name(); } - virtual std::string name_ext() const override { - std::stringstream out; - out << name() << " "; - out << "size: " << cnt; - return out.str(); - } + virtual std::string name_ext() const override; ze_a2a_allreduce_entry() = delete; explicit ze_a2a_allreduce_entry(ccl_sched* sched, @@ -49,7 +40,8 @@ class ze_a2a_allreduce_entry : public ze_base_entry { ccl_comm* comm, std::vector wait_events = {}, size_t send_buf_idx = 0, - size_t recv_buf_idx = 1); + size_t recv_buf_idx = 1, + size_t peer_buf_offset = 0); void init_ze_hook() override; @@ -57,24 +49,7 @@ class ze_a2a_allreduce_entry : public ze_base_entry { void update() override; protected: - void dump_detail(std::stringstream& str) const override { - ccl_logger::format(str, - "dt ", - ccl::global_data::get().dtypes->name(dtype), - ", cnt ", - cnt, - ", send_buf ", - send_buf, - ", recv_buf ", - recv_buf, - ", op ", - ccl_reduction_to_str(op), - ", comm ", - comm->to_string(), - ", context ", - context, - "\n"); - } + void dump_detail(std::stringstream& str) const override; private: static constexpr size_t event_group_count{ 3 }; // copy + 
kernel + copy @@ -87,6 +62,7 @@ class ze_a2a_allreduce_entry : public ze_base_entry { const size_t send_buf_idx; const size_t recv_buf_idx; + const size_t peer_buf_offset; const int peer_count; diff --git a/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp index cb2d6325e..2cbac5590 100644 --- a/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp +++ b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp @@ -14,12 +14,14 @@ limitations under the License. */ #include "common/stream/stream.hpp" +#include "comp/comp.hpp" #include "sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp" #include "sched/entry/ze/ze_primitives.hpp" #include "sched/entry/ze/ze_cache.hpp" #include "sched/queue/queue.hpp" #include +#include using namespace ccl; using namespace ccl::ze; @@ -32,7 +34,7 @@ ze_onesided_allreduce_entry::ze_onesided_allreduce_entry(ccl_sched* sched, reduction op, ccl_comm* comm, std::vector wait_events, - const size_t buf_offset_cnt) + size_t peer_buf_offset) : ze_base_entry(sched, comm, 3 /* request additional events */, wait_events), send_buf(send_buf), recv_buf(recv_buf), @@ -40,7 +42,7 @@ ze_onesided_allreduce_entry::ze_onesided_allreduce_entry(ccl_sched* sched, dtype(dtype), op(op), buf_size_bytes(dtype.size() * cnt), - buf_offset_bytes(dtype.size() * buf_offset_cnt) {} + buf_offset_bytes(dtype.size() * peer_buf_offset) {} void ze_onesided_allreduce_entry::init_ze_hook() { /* create kernels */ @@ -48,8 +50,8 @@ void ze_onesided_allreduce_entry::init_ze_hook() { ccl_buffer right_recv_buf; int peer_rank = (comm_rank + 1) % comm_size; - send_buf_ptr = static_cast(send_buf.get_ptr()) + buf_offset_bytes; - recv_buf_ptr = static_cast(recv_buf.get_ptr()) + buf_offset_bytes; + send_buf_ptr = send_buf.get_ptr(); + recv_buf_ptr = recv_buf.get_ptr(); if (send_buf_ptr == recv_buf_ptr) { sched->get_memory().handle_manager.get(peer_rank, 1, right_send_buf, comm); 
sched->get_memory().handle_manager.get(peer_rank, 1, right_recv_buf, comm); @@ -77,29 +79,10 @@ void ze_onesided_allreduce_entry::init_ze_hook() { LOG_DEBUG("get kernel: name: ", main_kernel_name); global_data::get().ze_data->cache->get(context, device, "kernels.spv", &module); - global_data::get().ze_data->cache->get(worker_idx, module, main_kernel_name, &main_kernel); ze_kernel_args_t allreduce_kernel_args{ &comm_rank, &comm_size, &cnt, &send_buf_ptr, &recv_buf_ptr, &right_send_buf_ptr, &right_recv_buf_ptr }; - ze_kernel_args_t reduce_local_kernel_args{ &comm_rank, &comm_size, &cnt, - &send_buf_ptr, &tmp_buf_ptr, &recv_buf_ptr }; - - auto& main_kernel_args = (global_data::env().enable_kernel_1s_copy_ops) - ? reduce_local_kernel_args - : allreduce_kernel_args; - LOG_DEBUG("kernel ", main_kernel, " args:\n", to_string(main_kernel_args)); - set_kernel_args(main_kernel, main_kernel_args); - - ze_group_size_t group_size; - get_suggested_group_size(main_kernel, cnt, &group_size); - LOG_DEBUG("suggested group size: ", to_string(group_size)); - - get_suggested_group_count(group_size, cnt, &group_count); - LOG_DEBUG("suggested group count: ", to_string(group_count)); - - ZE_CALL(zeKernelSetGroupSize, - (main_kernel, group_size.groupSizeX, group_size.groupSizeY, group_size.groupSizeZ)); if (global_data::env().enable_kernel_1s_ipc_wa) { LOG_DEBUG("get kernel: name: ", empty_kernel_name); @@ -135,6 +118,23 @@ void ze_onesided_allreduce_entry::init_ze_hook() { if (global_data::env().enable_kernel_1s_copy_ops) { LOG_DEBUG("one-sided multi-phase algorithm"); + global_data::get().ze_data->cache->get(worker_idx, module, main_kernel_name, &main_kernel); + + ze_kernel_args_t main_kernel_args{ &comm_rank, &comm_size, &cnt, + &send_buf_ptr, &tmp_buf_ptr, &recv_buf_ptr }; + LOG_DEBUG("kernel ", main_kernel, " args:\n", to_string(main_kernel_args)); + set_kernel_args(main_kernel, main_kernel_args); + + ze_group_size_t group_size; + get_suggested_group_size(main_kernel, cnt, 
&group_size); + LOG_DEBUG("suggested group size: ", to_string(group_size)); + + get_suggested_group_count(group_size, cnt, &group_count); + LOG_DEBUG("suggested group count: ", to_string(group_count)); + + ZE_CALL(zeKernelSetGroupSize, + (main_kernel, group_size.groupSizeX, group_size.groupSizeY, group_size.groupSizeZ)); + ZE_CALL(zeCommandListAppendMemoryCopy, (ze_base_entry::get_copy_list(), tmp_buf_ptr, @@ -163,13 +163,63 @@ void ze_onesided_allreduce_entry::init_ze_hook() { } else { LOG_DEBUG("one-sided monolithic algorithm"); - ZE_CALL(zeCommandListAppendLaunchKernel, - (ze_base_entry::get_comp_list(), - main_kernel, - &group_count, - ze_base_entry::entry_event, - (empty_kernel_event) ? 1 : 0, - &empty_kernel_event)); + + // use recv_buf_ptr instead of right_recv_buf_ptr since we cannot make sure + // right_recv_buf_ptr got using ipc has the same alignment as remote recv_buf_ptr. + // we assume local recv_buf_ptr and remote recv_buf_ptr has the same alignment + unsigned long pre_align_offset_byte = ccl::utils::get_aligned_offset_byte( + recv_buf_ptr, buf_size_bytes, ccl::global_data::env().kernel_mem_align); + + // first kernel starts from location 0 to pre_align_offset_byte + // and the second kernel starts from location pre_align_offset_byte to the rest + constexpr int kernel_count = (int)ccl::utils::align_kernels::count; + const unsigned long offsets[kernel_count] = { 0, pre_align_offset_byte }; + const unsigned long counts[kernel_count] = { pre_align_offset_byte / dtype.size(), + cnt - pre_align_offset_byte / dtype.size() }; + ze_event_handle_t events[kernel_count]; + int start_kernel = (int)ccl::utils::align_kernels::unaligned; + // when pre_align_offset_byte is 0, only aligned kernel is needed + if (pre_align_offset_byte == 0) { + start_kernel = (int)ccl::utils::align_kernels::aligned; + } + // if the initial data is aligned, we need only one kernel + // otherwise run two kernels, one for unaligned and one for aligned data + for (int i = start_kernel; i 
< kernel_count; i++) { + kernels.emplace_back(module, main_kernel_name, worker_idx); + + void* send_buf_ptr_tmp = static_cast(send_buf_ptr) + offsets[i]; + void* recv_buf_ptr_tmp = static_cast(recv_buf_ptr) + offsets[i]; + void* right_send_buf_ptr_tmp = static_cast(right_send_buf_ptr) + offsets[i]; + void* right_recv_buf_ptr_tmp = static_cast(right_recv_buf_ptr) + offsets[i]; + ze_kernel_args_t main_kernel_args{ &comm_rank, + &comm_size, + &counts[i], + &send_buf_ptr_tmp, + &recv_buf_ptr_tmp, + &right_send_buf_ptr_tmp, + &right_recv_buf_ptr_tmp }; + + kernels.back().set_args(main_kernel_args); + kernels.back().calculate_group_size(counts[i]); + events[i] = (start_kernel == (int)ccl::utils::align_kernels::aligned) + ? ze_base_entry::entry_event + : ze_base_entry::create_event(); + + ZE_CALL(zeCommandListAppendLaunchKernel, + (ze_base_entry::get_comp_list(), + kernels.back().get_kernel(), + kernels.back().get_group_count(), + events[i], + (empty_kernel_event) ? 1 : 0, + &empty_kernel_event)); + } + + // use a barrier to combine the events of the unalinged and aligned kernel + if (start_kernel == (int)ccl::utils::align_kernels::unaligned) { + ZE_CALL( + zeCommandListAppendBarrier, + (ze_base_entry::get_comp_list(), ze_base_entry::entry_event, kernel_count, events)); + } } } @@ -178,7 +228,9 @@ void ze_onesided_allreduce_entry::finalize_ze_hook() { global_data::get().ze_data->cache->push( worker_idx, module, empty_kernel_name, empty_kernel); } - global_data::get().ze_data->cache->push(worker_idx, module, main_kernel_name, main_kernel); + if (global_data::env().enable_kernel_1s_copy_ops) { + global_data::get().ze_data->cache->push(worker_idx, module, main_kernel_name, main_kernel); + } } void ze_onesided_allreduce_entry::start() { @@ -203,3 +255,28 @@ void ze_onesided_allreduce_entry::update() { global_data::get().ze_data->kernel_counter--; } } + +std::string ze_onesided_allreduce_entry::name_ext() const { + std::stringstream out; + out << name() << ":" << cnt * 
dtype.size(); + return out.str(); +} + +void ze_onesided_allreduce_entry::dump_detail(std::stringstream& str) const { + ccl_logger::format(str, + "dt ", + ccl::global_data::get().dtypes->name(dtype), + ", cnt ", + cnt, + ", send_buf ", + send_buf, + ", recv_buf ", + recv_buf, + ", op ", + ccl_reduction_to_str(op), + ", comm ", + comm->to_string(), + ", context ", + context, + "\n"); +} diff --git a/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp index 4b461cbb6..454ca535c 100644 --- a/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp +++ b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp @@ -16,12 +16,8 @@ #pragma once #include "common/utils/buffer.hpp" -#include "comp/comp.hpp" #include "sched/entry/ze/ze_base_entry.hpp" -#include -#include - class ze_onesided_allreduce_entry : public ze_base_entry { public: static constexpr const char* class_name() noexcept { @@ -32,12 +28,7 @@ class ze_onesided_allreduce_entry : public ze_base_entry { return class_name(); } - virtual std::string name_ext() const override { - std::stringstream out; - out << name() << " "; - out << "size: " << cnt; - return out.str(); - } + virtual std::string name_ext() const override; ze_onesided_allreduce_entry() = delete; explicit ze_onesided_allreduce_entry(ccl_sched* sched, @@ -48,7 +39,7 @@ class ze_onesided_allreduce_entry : public ze_base_entry { ccl::reduction op, ccl_comm* comm, std::vector wait_events = {}, - const size_t buf_offset_cnt = 0); + size_t peer_buf_offset = 0); void init_ze_hook() override; void finalize_ze_hook() override; @@ -57,24 +48,7 @@ class ze_onesided_allreduce_entry : public ze_base_entry { void update() override; protected: - void dump_detail(std::stringstream& str) const override { - ccl_logger::format(str, - "dt ", - ccl::global_data::get().dtypes->name(dtype), - ", cnt ", - cnt, - ", send_buf ", - send_buf, - ", recv_buf ", - recv_buf, - ", op ", - 
ccl_reduction_to_str(op), - ", comm ", - comm->to_string(), - ", context ", - context, - "\n"); - } + void dump_detail(std::stringstream& str) const override; private: const ccl_buffer send_buf; @@ -100,4 +74,6 @@ class ze_onesided_allreduce_entry : public ze_base_entry { ze_kernel_handle_t empty_kernel{}; std::string empty_kernel_name{ "empty_kernel" }; + + std::vector kernels; }; diff --git a/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.cpp b/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.cpp index 9b3fcf6c1..09ad07f88 100644 --- a/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.cpp +++ b/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "comp/comp.hpp" #include "common/stream/stream.hpp" #include "sched/entry/ze/allreduce/ze_ring_allreduce_entry.hpp" #include "sched/entry/ze/ze_primitives.hpp" @@ -63,11 +64,12 @@ void ze_ring_allreduce_entry::atl_ops_init() { sync_send_flags.resize(total_iter_count, comm_rank); for (int i = 0; i < total_iter_count; ++i) { - send_tags[i] = comm->get_atl_comm()->tag->create(right_peer, - comm->get_comm_id(), - sched->sched_id, - sched->get_op_id() + i + op_id_offset); - recv_tags[i] = comm->get_atl_comm()->tag->create( + send_tags[i] = + comm->get_atl_comm()->tag_creator->create(right_peer, + comm->get_comm_id(), + sched->sched_id, + sched->get_op_id() + i + op_id_offset); + recv_tags[i] = comm->get_atl_comm()->tag_creator->create( comm_rank, comm->get_comm_id(), sched->sched_id, sched->get_op_id() + i + op_id_offset); } @@ -554,3 +556,28 @@ void ze_ring_allreduce_entry::reset_fields() { std::fill(ag_copy_started.begin(), ag_copy_started.end(), false); } } + +std::string ze_ring_allreduce_entry::name_ext() const { + std::stringstream out; + out << name() << ":" << cnt * dtype.size(); + return out.str(); +} + +void ze_ring_allreduce_entry::dump_detail(std::stringstream& str) 
const { + ccl_logger::format(str, + "dt ", + ccl::global_data::get().dtypes->name(dtype), + ", cnt ", + cnt, + ", send_buf ", + send_buf, + ", recv_buf ", + recv_buf, + ", op ", + ccl_reduction_to_str(op), + ", comm ", + comm->to_string(), + ", context ", + context, + "\n"); +} diff --git a/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.hpp b/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.hpp index 5f5eee3cc..3bcdd0314 100644 --- a/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.hpp +++ b/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.hpp @@ -16,7 +16,6 @@ #pragma once #include "common/utils/buffer.hpp" -#include "comp/comp.hpp" #include "sched/entry/ze/ze_base_entry.hpp" class ze_ring_allreduce_entry : public ze_base_entry { @@ -29,12 +28,7 @@ class ze_ring_allreduce_entry : public ze_base_entry { return class_name(); } - virtual std::string name_ext() const override { - std::stringstream out; - out << name() << " "; - out << "size: " << cnt; - return out.str(); - } + virtual std::string name_ext() const override; ze_ring_allreduce_entry() = delete; explicit ze_ring_allreduce_entry(ccl_sched* sched, @@ -57,24 +51,7 @@ class ze_ring_allreduce_entry : public ze_base_entry { void reset_fields(); protected: - void dump_detail(std::stringstream& str) const override { - ccl_logger::format(str, - "dt ", - ccl::global_data::get().dtypes->name(dtype), - ", cnt ", - cnt, - ", send_buf ", - send_buf, - ", recv_buf ", - recv_buf, - ", op ", - ccl_reduction_to_str(op), - ", comm ", - comm->to_string(), - ", context ", - context, - "\n"); - } + void dump_detail(std::stringstream& str) const override; private: static constexpr uint32_t local_events_count{ 3 }; diff --git a/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp b/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp index 62fc4c067..8f5ab3a05 100644 --- a/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp +++ b/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp @@ -25,8 +25,8 @@ using namespace ccl::ze; 
ze_a2a_allgatherv_entry::ze_a2a_allgatherv_entry(ccl_sched* sched, ccl_buffer send_buf, size_t send_count, - ccl_buffer recv_buf, - const size_t* recv_counts, + std::vector recv_bufs, + std::vector recv_counts, const ccl_datatype& dtype, ccl_comm* comm, std::vector wait_events, @@ -35,85 +35,315 @@ ze_a2a_allgatherv_entry::ze_a2a_allgatherv_entry(ccl_sched* sched, : ze_base_entry(sched, comm, comm->size() * event_group_count, wait_events), send_buf(send_buf), send_count(send_count), - recv_buf(recv_buf), - recv_counts(recv_counts, recv_counts + comm->size()), + recv_bufs(recv_bufs), + recv_counts(recv_counts), dtype(dtype), peer_buf_idx(peer_buf_idx), peer_buf_offset(peer_buf_offset), peer_count(comm->size() - 1) {} +void ze_a2a_allgatherv_entry::fill_list_read(const ze_base_entry* entry, + int comm_rank, + ccl_buffer send_buf, + const std::vector& recv_bufs, + const std::vector& peer_send_bufs, + int peer_count, + const std::vector& copy_bytes, + const ccl_datatype& dtype, + const std::vector& rank_buf_offsets, + bool is_inplace, + std::vector& copy_events, + std::vector& wait_events, + size_t peer_buf_offset, + bool is_monolithic) { + if (is_monolithic) { + LOG_INFO("allgatherv read not allowed with monolithic kernel"); + } + const size_t comm_size = peer_count + 1; + for (int i = 0; i < peer_count; ++i) { + const int peer_rank = (comm_rank + i + 1) % comm_size; + void* src = peer_send_bufs[peer_rank].get_ptr(); + if (is_inplace) { + // TODO: use peer_send_bufs directly without adding offset + src = (peer_send_bufs[peer_rank] + + (rank_buf_offsets.at(peer_rank) + peer_buf_offset) * dtype.size()) + .get_ptr(); + } + + void* dst = recv_bufs[peer_rank].get_ptr(); + + auto list = entry->get_copy_list(copy_direction::c2c, i); + ZE_CALL(zeCommandListAppendMemoryCopy, + (list, + dst, + src, + copy_bytes.at(peer_rank), + copy_events.at(i), + wait_events.size(), + wait_events.data())); + } +} + +void ze_a2a_allgatherv_entry::fill_list_write(const ze_base_entry* entry, + 
int comm_rank, + ccl_buffer send_buf, + const std::vector& recv_bufs, + const std::vector& peer_recv_bufs, + int peer_count, + const std::vector& copy_bytes, + const ccl_datatype& dtype, + const std::vector& rank_buf_offsets, + bool is_inplace, + std::vector& copy_events, + std::vector& wait_events, + std::vector& kernels, + ze_module_handle_t module, + ze_device_handle_t device, + ze_context_handle_t context, + size_t worker_idx, + size_t peer_buf_offset, + bool is_monolithic) { + /* copy send_buf to peer buffers */ + const size_t comm_size = peer_count + 1; + + std::vector peer_bufs; + ccl_buffer src_buf = send_buf; + if (is_inplace) { + src_buf = recv_bufs.at(comm_rank); + } + for (int i = 0; i < peer_count; ++i) { + const int peer_rank = (comm_rank + i + 1) % comm_size; + // TODO: use peer_recv_bufs directly without adding offset + ccl_buffer dst_buf = peer_recv_bufs[peer_rank] + + (rank_buf_offsets.at(comm_rank) + peer_buf_offset) * dtype.size(); + + if (is_monolithic) { + peer_bufs.push_back(dst_buf); + } + else { + // TODO: if we on the same device, then use t2t direction + auto list = entry->get_copy_list(copy_direction::c2c, i); + ZE_CALL(zeCommandListAppendMemoryCopy, + (list, + dst_buf.get_ptr(), + src_buf.get_ptr(), + copy_bytes.at(comm_rank), + copy_events.at(i), + wait_events.size(), + wait_events.data())); + } + } + if (is_monolithic) { + //TODO: add fallback path for peer_count > max_peer_count + CCL_THROW_IF_NOT(size_t(peer_count) <= ccl::ze::max_peer_count, + "monolithic kernel not supported for peer_count ", + peer_count, + " > ", + ccl::ze::max_peer_count); + global_data::get().ze_data->cache->get(context, device, "kernels.spv", &module); + std::string monolithic_kernel_name = + "write_monolithic_kernel_" + std::to_string(peer_count) + "_" + to_string(dtype.idx()) + + "_" + ccl_reduction_to_str(ccl::reduction::custom); + LOG_DEBUG("allgatherv monolithic kernel name: ", monolithic_kernel_name); + + // use src instead of peer dst since we cannot 
make sure + // peer dst got using ipc has the same alignment as remote buffer + // we assume local buffer and remote buffer has the same alignment + size_t buf_size_bytes = copy_bytes.at(comm_rank); + unsigned long pre_align_offset_byte = + ccl::utils::get_aligned_offset_byte(recv_bufs.at(comm_rank).get_ptr(), + buf_size_bytes, + ccl::global_data::env().kernel_mem_align); + + // First kernel starts from location 0 to pre_align_offset_byte + // and the second kernel starts from location pre_align_offset_byte to the rest + const size_t copy_count = copy_bytes.at(comm_rank) / dtype.size(); + constexpr int kernel_count = (int)ccl::utils::align_kernels::count; + const unsigned long offsets[kernel_count] = { 0, pre_align_offset_byte }; + const unsigned long counts[kernel_count] = { + pre_align_offset_byte / dtype.size(), copy_count - pre_align_offset_byte / dtype.size() + }; + + // Start two kernels, first kernel for the part of the array before the aligned start offset + // and second kernel for the rest of the array from the aligned start offset to the end + for (int i = 0; i < kernel_count; i++) { + unsigned long count_local = counts[i]; + // data count is small and there is no need to execute the second aligned kernel + if (i == (int)ccl::utils::align_kernels::aligned && count_local == 0) { + copy_events.at(i) = copy_events.at(i - 1); + break; + } + void* src = (src_buf + offsets[i]).get_ptr(); + std::vector dsts; + for (auto& peer_buf : peer_bufs) { + dsts.push_back((peer_buf + offsets[i]).get_ptr()); + } + kernels.emplace_back(module, monolithic_kernel_name, worker_idx); + ze_kernel_arg_t peer_bufs_ze_arg(dsts.data(), dsts.size()); + kernels.back().set_args({ &count_local, &src, peer_bufs_ze_arg }); + kernels.back().calculate_group_size(count_local); + + ZE_CALL(zeCommandListAppendLaunchKernel, + (entry->get_comp_list(), + kernels.back().get_kernel(), + kernels.back().get_group_count(), + copy_events.at(i), + wait_events.size(), + wait_events.data())); + } + } +} + 
void ze_a2a_allgatherv_entry::fill_list(const ze_base_entry* entry, int comm_rank, - void* send_buf, - void* recv_buf, - const std::vector& peer_recv_bufs, + ccl_buffer send_buf, + const std::vector& recv_bufs, + const std::vector& peer_bufs, int peer_count, - size_t copy_bytes, - size_t offset_bytes, + const std::vector& copy_bytes, + const ccl_datatype& dtype, + const std::vector& rank_buf_offsets, bool is_inplace, std::vector& copy_events, - ze_event_handle_t wait_event) { - /* copy send_buf to peer buffers */ - for (int i = 0; i < peer_count; ++i) { - void* src = send_buf; - if (is_inplace) { - src = static_cast(recv_buf) + offset_bytes; - } - void* dst = static_cast(peer_recv_bufs[i].get_ptr()) + offset_bytes; - auto list = entry->get_copy_list(i, true); - ZE_CALL(zeCommandListAppendMemoryCopy, - (list, dst, src, copy_bytes, copy_events.at(i), (wait_event) ? 1 : 0, &wait_event)); + std::vector& wait_events, + std::vector& kernels, + ze_module_handle_t module, + ze_device_handle_t device, + ze_context_handle_t context, + size_t worker_idx, + size_t peer_buf_offset, + bool is_read, + bool is_monolithic) { + if (is_read) { + fill_list_read(entry, + comm_rank, + send_buf, + recv_bufs, + peer_bufs, + peer_count, + copy_bytes, + dtype, + rank_buf_offsets, + is_inplace, + copy_events, + wait_events, + peer_buf_offset, + is_monolithic); + } + else { + fill_list_write(entry, + comm_rank, + send_buf, + recv_bufs, + peer_bufs, + peer_count, + copy_bytes, + dtype, + rank_buf_offsets, + is_inplace, + copy_events, + wait_events, + kernels, + module, + device, + context, + worker_idx, + peer_buf_offset, + is_monolithic); } if (!is_inplace) { /* copy send_buf to my buffer */ - void* src = send_buf; - void* dst = static_cast(recv_buf) + offset_bytes; - auto list = entry->get_copy_list(); - ZE_CALL( - zeCommandListAppendMemoryCopy, - (list, dst, src, copy_bytes, copy_events.back(), (wait_event) ? 
1 : 0, &wait_event)); + void* src = send_buf.get_ptr(); + void* dst = recv_bufs.at(comm_rank).get_ptr(); + auto list = entry->get_copy_list(copy_direction::t2t); + ZE_CALL(zeCommandListAppendMemoryCopy, + (list, + dst, + src, + copy_bytes.at(comm_rank), + copy_events.back(), + wait_events.size(), + wait_events.data())); } } void ze_a2a_allgatherv_entry::init_ze_hook() { /* get peer recv buffers */ - std::vector peer_recv_bufs(peer_count); + std::vector peer_recv_bufs(comm->size()); for (int i = 0; i < peer_count; ++i) { - int peer_rank = (comm_rank + i + 1) % comm->size(); + const int peer_rank = (comm_rank + i + 1) % comm->size(); ccl_buffer buf{}; sched->get_memory().handle_manager.get(peer_rank, peer_buf_idx, buf, comm); CCL_THROW_IF_NOT(buf.get_ptr(), "null IPC buffer is received"); - peer_recv_bufs[i] = buf + peer_buf_offset * dtype.size(); + peer_recv_bufs[peer_rank] = buf; } bool is_inplace{}; - if (send_buf == recv_buf) { + if (send_buf == recv_bufs.at(comm_rank)) { is_inplace = true; } + std::vector rank_buf_offsets(comm_size); + for (int i = 1; i < comm_size; i++) { + rank_buf_offsets[i] = rank_buf_offsets[i - 1] + recv_counts[i - 1]; + } - size_t offset_count = std::accumulate(recv_counts.begin(), recv_counts.begin() + comm_rank, 0); - size_t offset_bytes = offset_count * dtype.size(); - size_t block_bytes = - (!is_inplace) ? (send_count * dtype.size()) : recv_counts[comm_rank] * dtype.size(); - LOG_DEBUG("rank: ", comm_rank, ", block_bytes: ", block_bytes); + CCL_THROW_IF_NOT(send_count == recv_counts[comm_rank], + "allgatherv send_count :", + send_count, + " and recv_count :", + recv_counts[comm_rank], + " does not match"); - copy_events.resize((!is_inplace) ? 
comm_size : peer_count); + std::vector block_bytes(comm_size); + for (int i = 0; i < comm_size; i++) { + block_bytes[i] = recv_counts[i] * dtype.size(); + } + + LOG_DEBUG("rank: ", comm_rank, ", block_bytes: ", block_bytes.at(comm_rank)); + + bool is_monolithic = ccl::global_data::env().allgatherv_monolithic_kernel; + bool is_read = ccl::global_data::env().allgatherv_topo_read; + // TODO: MLSL-1651 make int8 work with allgatherv write monolithic kernel + if (dtype == ccl::datatype::int8) { + is_monolithic = false; + } + size_t copy_events_size = peer_count; + // write requires two kernels, unaligned and aligned kernel + if (is_monolithic && !is_read) { + copy_events_size = (int)ccl::utils::align_kernels::count; + } + // need additional memcpy for non inplace data + if (!is_inplace) { + copy_events_size++; + } + copy_events.resize(copy_events_size); for (auto& event : copy_events) { event = ze_base_entry::create_event(); } + std::vector empty_wait_events; fill_list(this, comm_rank, - send_buf.get_ptr(), - recv_buf.get_ptr(), + send_buf, + recv_bufs, peer_recv_bufs, peer_count, block_bytes, - offset_bytes, + dtype, + rank_buf_offsets, is_inplace, - copy_events); + copy_events, + empty_wait_events, + kernels, + module, + device, + context, + worker_idx, + peer_buf_offset, + is_read, + is_monolithic); } void ze_a2a_allgatherv_entry::update() { @@ -126,3 +356,13 @@ void ze_a2a_allgatherv_entry::update() { ZE_CALL(zeEventHostSignal, (ze_base_entry::entry_event)); ze_base_entry::update(); } + +std::string ze_a2a_allgatherv_entry::name_ext() const { + std::stringstream out; + out << name() << ":" << send_count * dtype.size(); + return out.str(); +} + +void ze_a2a_allgatherv_entry::dump_detail(std::stringstream& str) const { + ccl_logger::format(str, "comm ", comm->to_string(), "\n"); +} diff --git a/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp b/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp index 2170e02aa..890f895d0 100644 --- 
a/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp +++ b/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp @@ -21,25 +21,20 @@ class ze_a2a_allgatherv_entry : public ze_base_entry { public: static constexpr const char* class_name() noexcept { - return "ZE_ALLGATHERV"; + return "ZE_A2A_ALLGATHERV"; } const char* name() const override { return class_name(); } - virtual std::string name_ext() const override { - std::stringstream out; - out << name() << " "; - out << "send size: " << send_count; - return out.str(); - } + virtual std::string name_ext() const override; explicit ze_a2a_allgatherv_entry(ccl_sched* sched, ccl_buffer send_buf, size_t send_count, - ccl_buffer recv_buf, - const size_t* recv_counts, + std::vector recv_bufs, + std::vector recv_counts, const ccl_datatype& dtype, ccl_comm* comm, std::vector wait_events = {}, @@ -52,27 +47,34 @@ class ze_a2a_allgatherv_entry : public ze_base_entry { static void fill_list(const ze_base_entry* entry, int comm_rank, - void* send_buf, - void* recv_buf, - const std::vector& peer_recv_bufs, + ccl_buffer send_buf, + const std::vector& recv_bufs, + const std::vector& peer_bufs, int peer_count, - size_t copy_bytes, - size_t offset_bytes, + const std::vector& copy_bytes, + const ccl_datatype& dtype, + const std::vector& rank_buf_offsets, bool is_inplace, std::vector& copy_events, - ze_event_handle_t wait_event = nullptr); + std::vector& wait_events, + std::vector& kernels, + ze_module_handle_t module, + ze_device_handle_t device, + ze_context_handle_t context, + size_t worker_idx, + size_t peer_buf_offset, + bool is_read, + bool is_monolithic); protected: - void dump_detail(std::stringstream& str) const override { - ccl_logger::format(str, "comm ", comm->to_string(), "\n"); - } + void dump_detail(std::stringstream& str) const override; private: static constexpr size_t event_group_count{ 1 }; // copy phase const ccl_buffer send_buf; const size_t send_count; - const ccl_buffer recv_buf; + const std::vector recv_bufs; const 
std::vector recv_counts; const ccl_datatype dtype; const size_t peer_buf_idx; @@ -80,4 +82,41 @@ class ze_a2a_allgatherv_entry : public ze_base_entry { const int peer_count; std::vector copy_events; + std::vector kernels; + std::vector kernel_events; + + static void fill_list_read(const ze_base_entry* entry, + int comm_rank, + ccl_buffer send_buf, + const std::vector& recv_bufs, + const std::vector& peer_send_bufs, + int peer_count, + const std::vector& copy_bytes, + const ccl_datatype& dtype, + const std::vector& rank_buf_offsets, + bool is_inplace, + std::vector& copy_events, + std::vector& wait_events, + size_t peer_buf_offset, + bool is_monolithic); + + static void fill_list_write(const ze_base_entry* entry, + int comm_rank, + ccl_buffer send_buf, + const std::vector& recv_bufs, + const std::vector& peer_recv_bufs, + int peer_count, + const std::vector& copy_bytes, + const ccl_datatype& dtype, + const std::vector& rank_buf_offsets, + bool is_inplace, + std::vector& copy_events, + std::vector& wait_events, + std::vector& kernels, + ze_module_handle_t module, + ze_device_handle_t device, + ze_context_handle_t context, + size_t worker_idx, + size_t peer_buf_offset, + bool is_monolithic); }; diff --git a/src/sched/entry/ze/ze_a2a_gatherv_entry.cpp b/src/sched/entry/ze/ze_a2a_gatherv_entry.cpp index d36086c73..92fbe8955 100644 --- a/src/sched/entry/ze/ze_a2a_gatherv_entry.cpp +++ b/src/sched/entry/ze/ze_a2a_gatherv_entry.cpp @@ -74,3 +74,24 @@ void ze_a2a_gatherv_entry::init_ze_hook() { nullptr)); } } + +std::string ze_a2a_gatherv_entry::name_ext() const { + std::stringstream out; + out << name() << ":" << send_bytes; + return out.str(); +} + +void ze_a2a_gatherv_entry::dump_detail(std::stringstream& str) const { + ccl_logger::format(str, + "dt ", + ccl::global_data::get().dtypes->name(dtype), + ", send_buf ", + send_buf, + ", recv_buf ", + recv_buf, + ", comm ", + comm->to_string(), + ", context ", + context, + "\n"); +} diff --git 
a/src/sched/entry/ze/ze_a2a_gatherv_entry.hpp b/src/sched/entry/ze/ze_a2a_gatherv_entry.hpp index 5c1b1d9a0..f8c4d3a8e 100644 --- a/src/sched/entry/ze/ze_a2a_gatherv_entry.hpp +++ b/src/sched/entry/ze/ze_a2a_gatherv_entry.hpp @@ -28,12 +28,7 @@ class ze_a2a_gatherv_entry : public ze_base_entry { return class_name(); } - virtual std::string name_ext() const override { - std::stringstream out; - out << name() << " "; - out << "send size: " << send_bytes; - return out.str(); - } + virtual std::string name_ext() const override; explicit ze_a2a_gatherv_entry(ccl_sched* sched, ccl_buffer send_buf, @@ -47,6 +42,9 @@ class ze_a2a_gatherv_entry : public ze_base_entry { void init_ze_hook() override; +protected: + void dump_detail(std::stringstream& str) const override; + private: const ccl_buffer send_buf; const size_t send_bytes; diff --git a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp index 300b72232..b021a1983 100644 --- a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp +++ b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "comp/comp.hpp" #include "sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp" #include "sched/entry/ze/ze_cache.hpp" #include "sched/entry/ze/ze_primitives.hpp" @@ -30,7 +31,8 @@ ze_a2a_reduce_scatter_entry::ze_a2a_reduce_scatter_entry(ccl_sched* sched, reduction op, ccl_comm* comm, std::vector wait_events, - size_t peer_buf_idx) + size_t peer_buf_idx, + size_t peer_buf_offset) : ze_base_entry(sched, comm, comm->size() * event_group_count, wait_events), send_buf(send_buf), recv_buf(recv_buf), @@ -38,12 +40,14 @@ ze_a2a_reduce_scatter_entry::ze_a2a_reduce_scatter_entry(ccl_sched* sched, op(op), recv_counts(recv_counts, recv_counts + comm->size()), peer_buf_idx(peer_buf_idx), + peer_buf_offset(peer_buf_offset), peer_count(comm->size() - 1) {} -void ze_a2a_reduce_scatter_entry::kernel_init(size_t offset_bytes, +void ze_a2a_reduce_scatter_entry::kernel_init(size_t rank_buf_offset, size_t block_count, void* send_buf, void* base_ptr, + const std::vector& peer_send_bufs, int peer_count, const ccl_datatype& dtype, int comm_rank, @@ -52,28 +56,99 @@ void ze_a2a_reduce_scatter_entry::kernel_init(size_t offset_bytes, ze_device_handle_t device, ze_context_handle_t context, ccl::reduction op, - size_t worker_idx) { + size_t worker_idx, + size_t peer_buf_offset, + bool is_monolithic, + bool is_single_kernel) { global_data::get().ze_data->cache->get(context, device, "kernels.spv", &module); - std::string kernel_name = - "reduce_local_inplace_kernel_" + to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op); - /* reduce peer values in tmp_buf only */ - kernels.reserve(peer_count); unsigned long count = block_count; - for (int i = 1; i < peer_count; ++i) { - void* input_buf = static_cast(base_ptr) + i * block_count * dtype.size(); + if (is_monolithic && peer_count <= (int)ccl::ze::max_peer_count) { + std::string monolithic_kernel_name = + "reduce_monolithic_kernel_" + std::to_string(peer_count) + "_" + + to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op); + 
LOG_DEBUG("Reduce scatter monolithic kernel name: ", monolithic_kernel_name); + + // use send_buf instead of peer_send_buf_ptr since we cannot make sure + // peer_send_buf_ptr got using ipc has the same alignment as remote send_buf=. + // we assume local send_buf and remote send_buf has the same alignment + void* send_buf_ptr = static_cast(send_buf) + rank_buf_offset * dtype.size(); + size_t buf_size_bytes = count * dtype.size(); + unsigned long pre_align_offset_byte = ccl::utils::get_aligned_offset_byte( + send_buf_ptr, buf_size_bytes, ccl::global_data::env().kernel_mem_align); + + // First kernel starts from location 0 to pre_align_offset_byte + // and the second kernel starts from location pre_align_offset_byte to the rest + constexpr int kernel_count = (int)ccl::utils::align_kernels::count; + kernels.reserve(kernel_count); + + const unsigned long offsets[kernel_count] = { 0, pre_align_offset_byte }; + const unsigned long counts[kernel_count] = { pre_align_offset_byte / dtype.size(), + count - pre_align_offset_byte / dtype.size() }; + + // Start two kernels, first kernel for the part of the array before the aligned start offset + // and second kernel for the rest of the array from the aligned start offset to the end + for (int i = 0; i < kernel_count; i++) { + unsigned long count_local = counts[i]; + // data count is small and there is no need to excute the second aligned kernel + if (i == (int)ccl::utils::align_kernels::aligned && count_local == 0) { + break; + } + void* input_buf = static_cast(send_buf_ptr) + offsets[i]; + std::vector peer_bufs; + peer_bufs.reserve(peer_count); + for (auto& peer_send_buf : peer_send_bufs) { + void* peer_buf = static_cast(peer_send_buf.get_ptr()) + + (rank_buf_offset + peer_buf_offset) * dtype.size() + offsets[i]; + peer_bufs.push_back(peer_buf); + } + ze_kernel_arg_t peer_bufs_ze_arg(peer_bufs.data(), peer_bufs.size()); + void* output_buf = static_cast(base_ptr) + offsets[i]; + kernels.emplace_back(module, 
monolithic_kernel_name, worker_idx); + kernels.back().set_args({ &count_local, &input_buf, peer_bufs_ze_arg, &output_buf }); + kernels.back().calculate_group_size(count_local); + } + } + else if (is_single_kernel || (is_monolithic && peer_count > (int)ccl::ze::max_peer_count)) { + // fallback path for monolithic kernel if peer_count > max_peer_count + if (is_monolithic) { + LOG_WARN("monolithic kernel not supported for peer_count ", + peer_count, + " > ", + ccl::ze::max_peer_count); + } + std::string kernel_name = "reduce_single_local_inplace_kernel_" + to_string(dtype.idx()) + + "_" + ccl_reduction_to_str(op); + + // reduce peer values in tmp_buf and own values in send_buf into tmp_buf + kernels.reserve(1); + void* input_buf = static_cast(send_buf) + rank_buf_offset * dtype.size(); void* inoutput_buf = base_ptr; kernels.emplace_back(module, kernel_name, worker_idx); - kernels.back().set_args({ &count, &input_buf, &inoutput_buf }); + kernels.back().set_args({ &count, &peer_count, &input_buf, &inoutput_buf }); kernels.back().calculate_group_size(count); } + else { + std::string kernel_name = "reduce_local_inplace_kernel_" + to_string(dtype.idx()) + "_" + + ccl_reduction_to_str(op); + + // reduce peer values in tmp_buf only + kernels.reserve(peer_count); + for (int i = 1; i < peer_count; ++i) { + void* input_buf = static_cast(base_ptr) + i * block_count * dtype.size(); + void* inoutput_buf = base_ptr; + kernels.emplace_back(module, kernel_name, worker_idx); + kernels.back().set_args({ &count, &input_buf, &inoutput_buf }); + kernels.back().calculate_group_size(count); + } - /* reduce send_buf + tmp_buf */ - void* input_buf = static_cast(send_buf) + offset_bytes; - void* inoutput_buf = base_ptr; - kernels.emplace_back(module, kernel_name, worker_idx); - kernels.back().set_args({ &count, &input_buf, &inoutput_buf }); - kernels.back().calculate_group_size(count); + // reduce send_buf + tmp_buf + void* input_buf = static_cast(send_buf) + rank_buf_offset * dtype.size(); + 
void* inoutput_buf = base_ptr; + kernels.emplace_back(module, kernel_name, worker_idx); + kernels.back().set_args({ &count, &input_buf, &inoutput_buf }); + kernels.back().calculate_group_size(count); + } } void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry, @@ -83,7 +158,7 @@ void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry, int peer_count, int comm_rank, size_t block_count, - size_t offset_bytes, + size_t rank_buf_offset, std::vector& copy_events, std::vector& kernels, std::vector& kernel_events, @@ -93,11 +168,15 @@ void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry, ze_device_handle_t device, ze_context_handle_t context, ccl::reduction op, - size_t worker_idx) { - kernel_init(offset_bytes, + size_t worker_idx, + size_t peer_buf_offset, + bool is_monolithic, + bool is_single_kernel) { + kernel_init(rank_buf_offset, block_count, send_buf, tmp_buf, + peer_send_bufs, peer_count, dtype, comm_rank, @@ -106,30 +185,60 @@ void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry, device, context, op, - worker_idx); + worker_idx, + peer_buf_offset, + is_monolithic, + is_single_kernel); - size_t copy_bytes = block_count * dtype.size(); - /* copy peer segments to temp buffer */ - for (int i = 0; i < peer_count; i++) { - void* src = static_cast(peer_send_bufs[i].get_ptr()) + offset_bytes; - void* dst = static_cast(tmp_buf) + i * copy_bytes; - auto list = entry->get_copy_list(i, true); - ZE_CALL(zeCommandListAppendMemoryCopy, - (list, dst, src, copy_bytes, copy_events.at(i), 0, nullptr)); + if (is_monolithic && peer_count <= (int)ccl::ze::max_peer_count) { + // reduce stage + for (size_t i = 0; i < kernels.size(); ++i) { + ZE_CALL(zeCommandListAppendLaunchKernel, + (entry->get_comp_list(), + kernels[i].get_kernel(), + kernels[i].get_group_count(), + kernel_events.at(i), + 0, + nullptr)); + } + // if only unaligned kernel is executed, then fill the event for + // aligned kernel also since calling 
function expect two events + if (kernels.size() < (int)ccl::utils::align_kernels::count) { + CCL_THROW_IF_NOT(kernel_events.size() == (int)ccl::utils::align_kernels::count, + "monolithic kernel event count ", + kernel_events.size(), + " != ", + (int)ccl::utils::align_kernels::count); + // assign kernel_events[1] = kernel_events[0] + kernel_events.back() = kernel_events.front(); + } } + else { + size_t copy_bytes = block_count * dtype.size(); + /* copy peer segments to temp buffer */ + for (int i = 0; i < peer_count; i++) { + void* src = static_cast(peer_send_bufs[i].get_ptr()) + + (rank_buf_offset + peer_buf_offset) * dtype.size(); + void* dst = static_cast(tmp_buf) + i * copy_bytes; + // TODO: if we on the same device, then use t2t direction + auto list = entry->get_copy_list(copy_direction::c2c, i); + ZE_CALL(zeCommandListAppendMemoryCopy, + (list, dst, src, copy_bytes, copy_events.at(i), 0, nullptr)); + } + + ZE_CALL(zeCommandListAppendBarrier, + (entry->get_comp_list(), barrier_event, copy_events.size(), copy_events.data())); - ZE_CALL(zeCommandListAppendBarrier, - (entry->get_comp_list(), barrier_event, copy_events.size(), copy_events.data())); - - /* reduce stage */ - for (size_t i = 0; i < kernels.size(); ++i) { - ZE_CALL(zeCommandListAppendLaunchKernel, - (entry->get_comp_list(), - kernels[i].get_kernel(), - kernels[i].get_group_count(), - kernel_events.at(i), - 1, - (i == 0) ? &barrier_event : &kernel_events.at(i - 1))); + /* reduce stage */ + for (size_t i = 0; i < kernels.size(); ++i) { + ZE_CALL(zeCommandListAppendLaunchKernel, + (entry->get_comp_list(), + kernels[i].get_kernel(), + kernels[i].get_group_count(), + kernel_events.at(i), + 1, + (i == 0) ? 
&barrier_event : &kernel_events.at(i - 1))); + } } } @@ -161,18 +270,31 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() { /* copy peer segments to temp buffer */ - pre_copy_events.resize(peer_count); - for (auto& event : pre_copy_events) { - event = ze_base_entry::create_event(); + // do no need separate memcpys when using monolithic kernel + if (!ccl::global_data::env().reduce_scatter_monolithic_kernel) { + pre_copy_events.resize(peer_count); + for (auto& event : pre_copy_events) { + event = ze_base_entry::create_event(); + } } - kernel_events.resize(peer_count); + if (ccl::global_data::env().reduce_scatter_monolithic_kernel) { + // leftover kernel and aligned kernel + kernel_events.resize((int)ccl::utils::align_kernels::count); + } + else if (ccl::global_data::env().enable_kernel_single_reduce_peers) { + // when kernel merge is used only one kernel is required + kernel_events.resize(1); + } + else { + kernel_events.resize(peer_count); + } for (auto& event : kernel_events) { event = ze_base_entry::create_event(); } - size_t offset_count = std::accumulate(recv_counts.begin(), recv_counts.begin() + comm_rank, 0); - size_t offset_bytes = offset_count * dtype.size(); + size_t rank_buf_offset = + std::accumulate(recv_counts.begin(), recv_counts.begin() + comm_rank, 0); barrier_event = ze_base_entry::create_event(); @@ -183,7 +305,7 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() { peer_count, comm_rank, recv_counts[comm_rank], - offset_bytes, + rank_buf_offset, pre_copy_events, kernels, kernel_events, @@ -193,7 +315,10 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() { device, context, op, - worker_idx); + worker_idx, + peer_buf_offset, + ccl::global_data::env().reduce_scatter_monolithic_kernel, + ccl::global_data::env().enable_kernel_single_reduce_peers); post_copy_events.resize(1); for (auto& event : post_copy_events) { event = ze_base_entry::create_event(); @@ -204,8 +329,8 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() { tmp_buf, buf_bytes, 
post_copy_events.back(), - 1, - &kernel_events.back())); + kernel_events.size(), + kernel_events.data())); } void ze_a2a_reduce_scatter_entry::update() { @@ -217,3 +342,10 @@ void ze_a2a_reduce_scatter_entry::update() { ZE_CALL(zeEventHostSignal, (ze_base_entry::entry_event)); ze_base_entry::update(); } + +std::string ze_a2a_reduce_scatter_entry::name_ext() const { + std::stringstream out; + out << name() << ":" + << std::accumulate(recv_counts.begin(), recv_counts.end(), 0) * dtype.size(); + return out.str(); +} diff --git a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp index 9f92c8266..fad67d086 100644 --- a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp +++ b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp @@ -15,9 +15,7 @@ */ #pragma once -#include #include "common/utils/buffer.hpp" -#include "comp/comp.hpp" #include "sched/entry/ze/ze_base_entry.hpp" class ze_a2a_reduce_scatter_entry : public ze_base_entry { @@ -30,12 +28,7 @@ class ze_a2a_reduce_scatter_entry : public ze_base_entry { return class_name(); } - virtual std::string name_ext() const override { - std::stringstream out; - out << name() << " "; - out << "size: " << std::accumulate(recv_counts.begin(), recv_counts.end(), 0); - return out.str(); - } + virtual std::string name_ext() const override; ze_a2a_reduce_scatter_entry() = delete; explicit ze_a2a_reduce_scatter_entry(ccl_sched* sched, @@ -46,7 +39,8 @@ class ze_a2a_reduce_scatter_entry : public ze_base_entry { ccl::reduction op, ccl_comm* comm, std::vector wait_events = {}, - size_t peer_buf_idx = 0); + size_t peer_buf_idx = 0, + size_t peer_buf_offset = 0); void init_ze_hook() override; @@ -59,7 +53,7 @@ class ze_a2a_reduce_scatter_entry : public ze_base_entry { int peer_count, int comm_rank, size_t block_count, - size_t offset_bytes, + size_t rank_buf_offset, std::vector& copy_events, std::vector& kernels, std::vector& kernel_events, @@ -69,7 +63,10 @@ class 
ze_a2a_reduce_scatter_entry : public ze_base_entry { ze_device_handle_t device, ze_context_handle_t context, ccl::reduction op, - size_t worker_idx); + size_t worker_idx, + size_t peer_buf_offset, + bool is_monolithic, + bool is_single_kernel); private: static constexpr size_t event_group_count{ 3 }; // copy + kernel + copy @@ -80,6 +77,7 @@ class ze_a2a_reduce_scatter_entry : public ze_base_entry { const ccl::reduction op; const std::vector recv_counts; const size_t peer_buf_idx; + const size_t peer_buf_offset; const int peer_count; std::vector pre_copy_events; @@ -89,10 +87,11 @@ class ze_a2a_reduce_scatter_entry : public ze_base_entry { std::vector kernels; std::vector kernel_events; - static void kernel_init(size_t offset_bytes, + static void kernel_init(size_t rank_buf_offset, size_t block_count, void* send_buf, void* base_ptr, + const std::vector& peer_send_bufs, int peer_count, const ccl_datatype& dtype, int comm_rank, @@ -101,5 +100,8 @@ class ze_a2a_reduce_scatter_entry : public ze_base_entry { ze_device_handle_t device, ze_context_handle_t context, ccl::reduction op, - size_t worker_idx); + size_t worker_idx, + size_t peer_buf_offset, + bool is_monolithic, + bool is_single_kernel); }; diff --git a/src/sched/entry/ze/ze_barrier_entry.cpp b/src/sched/entry/ze/ze_barrier_entry.cpp index ea0d0e9d5..175a16308 100644 --- a/src/sched/entry/ze/ze_barrier_entry.cpp +++ b/src/sched/entry/ze/ze_barrier_entry.cpp @@ -13,20 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "ze_barrier_entry.hpp" - +#include "common/api_wrapper/ze_api_wrapper.hpp" +#include "sched/entry/ze/ze_barrier_entry.hpp" #include "sched/entry/ze/ze_base_entry.hpp" #include "sched/entry/ze/ze_primitives.hpp" ze_barrier_entry::ze_barrier_entry(ccl_sched* sched, ccl_comm* comm, ze_event_pool_handle_t& local_pool, - size_t event_idx) + size_t wait_event_idx) : sched_entry(sched), comm(comm), rank(comm->rank()), comm_size(comm->size()), - event_idx(event_idx), + wait_event_idx(wait_event_idx), local_pool(local_pool) { LOG_DEBUG("initialization"); CCL_THROW_IF_NOT(sched, "no sched"); @@ -61,7 +61,7 @@ void ze_barrier_entry::start() { ze_event_desc_t event_desc = default_event_desc; event_desc.signal = ZE_EVENT_SCOPE_FLAG_HOST; //TODO: DEVICE event_desc.wait = ZE_EVENT_SCOPE_FLAG_HOST; - event_desc.index = event_idx; + event_desc.index = wait_event_idx; signal_event = ze_base_entry::create_event(local_pool, event_desc); LOG_DEBUG("signal event is created for rank: ", rank); @@ -98,3 +98,7 @@ void ze_barrier_entry::update() { } } } + +void ze_barrier_entry::dump_detail(std::stringstream& str) const { + ccl_logger::format(str, "comm ", comm->to_string(), ", wait_events ", wait_events.size(), "\n"); +} diff --git a/src/sched/entry/ze/ze_barrier_entry.hpp b/src/sched/entry/ze/ze_barrier_entry.hpp index ab6dfc0df..f9cac1a6a 100644 --- a/src/sched/entry/ze/ze_barrier_entry.hpp +++ b/src/sched/entry/ze/ze_barrier_entry.hpp @@ -17,8 +17,6 @@ #include "sched/entry/factory/entry_factory.hpp" -#include "common/ze/ze_api_wrapper.hpp" - class ze_barrier_entry : public sched_entry { public: static constexpr const char* class_name() noexcept { @@ -42,17 +40,14 @@ class ze_barrier_entry : public sched_entry { void finalize() override; protected: - void dump_detail(std::stringstream& str) const override { - ccl_logger::format( - str, "comm ", comm->to_string(), ", wait_events ", wait_events.size(), "\n"); - } + void dump_detail(std::stringstream& str) const override; 
private: - ccl_comm* comm; + const ccl_comm* comm; const int rank; const int comm_size; size_t last_completed_event_idx{}; - size_t event_idx{}; + size_t wait_event_idx{}; ze_event_pool_handle_t local_pool{}; ze_event_handle_t signal_event{}; diff --git a/src/sched/entry/ze/ze_base_entry.cpp b/src/sched/entry/ze/ze_base_entry.cpp index feb1b3aa7..1ddb248f2 100644 --- a/src/sched/entry/ze/ze_base_entry.cpp +++ b/src/sched/entry/ze/ze_base_entry.cpp @@ -14,14 +14,15 @@ limitations under the License. */ #include "common/stream/stream.hpp" -#include "sched/queue/queue.hpp" - +#include "common/utils/sycl_utils.hpp" +#include "comm/comm.hpp" +#include "common/global/global.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" #include "sched/entry/ze/ze_base_entry.hpp" #include "sched/entry/ze/ze_cache.hpp" #include "sched/entry/ze/ze_call.hpp" #include "sched/entry/ze/ze_primitives.hpp" - -#include "common/utils/sycl_utils.hpp" +#include "sched/sched.hpp" using namespace ccl; using namespace ccl::ze; @@ -34,12 +35,12 @@ ze_base_entry::ze_base_entry(ccl_sched *sched, comm(comm), use_single_list(sched->use_single_list), wait_events(wait_events) { - if (!comm) { - comm = sched->coll_param.comm; + if (!this->comm) { + this->comm = sched->coll_param.comm; } - CCL_THROW_IF_NOT(comm, "no comm"); - comm_rank = comm->rank(); - comm_size = comm->size(); + CCL_THROW_IF_NOT(this->comm, "no comm"); + comm_rank = this->comm->rank(); + comm_size = this->comm->size(); // we can be here in case of copy_entry which may not have ze backend here, so check it if (sched->coll_param.stream && @@ -167,6 +168,22 @@ bool ze_base_entry::is_event_completed(ze_event_handle_t event) { void ze_base_entry::update() { bool complete = is_event_completed(entry_event); + if (is_update_time_expired) { + size_t complete_event_count = 0; + for (auto &event : wait_events) { + if (is_event_completed(event)) { + complete_event_count++; + } + } + LOG_DEBUG("completed ", + complete_event_count, + " of ", + 
wait_events.size(), + " wait events. Entry event ", + entry_event, + " is ", + (complete) ? "completed" : "not completed"); + } if (complete) { LOG_DEBUG(name(), " ", this, " entry complete"); @@ -202,13 +219,9 @@ ze_command_list_handle_t ze_base_entry::get_comp_list(uint32_t index) const { return sched->get_memory().list_manager->get_comp_list(this, wait_events, index); } -ze_command_list_handle_t ze_base_entry::get_copy_list(uint32_t index, bool peer_card_copy) const { - return sched->get_memory().list_manager->get_copy_list( - this, wait_events, index, peer_card_copy); -} - -std::string ze_base_entry::name_ext() const { - return "[empty]"; +ze_command_list_handle_t ze_base_entry::get_copy_list(copy_direction direction, + uint32_t index) const { + return sched->get_memory().list_manager->get_copy_list(this, wait_events, direction, index); } ze_event_handle_t ze_base_entry::create_event(ze_event_pool_handle_t event_pool, diff --git a/src/sched/entry/ze/ze_base_entry.hpp b/src/sched/entry/ze/ze_base_entry.hpp index 5014b8a86..29393c672 100644 --- a/src/sched/entry/ze/ze_base_entry.hpp +++ b/src/sched/entry/ze/ze_base_entry.hpp @@ -15,12 +15,12 @@ */ #pragma once -#include "comm/comm.hpp" -#include "common/global/global.hpp" -#include "sched/sched.hpp" +#include "sched/entry/copy/copy_helper.hpp" #include "sched/entry/entry.hpp" +#include "sched/queue/queue.hpp" -#include "common/ze/ze_api_wrapper.hpp" +class ccl_sched; +class ccl_comm; using namespace ccl::ze; @@ -38,7 +38,8 @@ class ze_base_entry : public sched_entry { virtual void update() override; ze_command_list_handle_t get_comp_list(uint32_t index = 0) const; - ze_command_list_handle_t get_copy_list(uint32_t index = 0, bool peer_card_copy = false) const; + ze_command_list_handle_t get_copy_list(copy_direction direction = copy_direction::d2d, + uint32_t index = 0) const; ze_event_handle_t entry_event{}; @@ -60,8 +61,6 @@ class ze_base_entry : public sched_entry { void init_entries(); void finalize_entries(); 
- virtual std::string name_ext() const; - ze_event_handle_t create_event(); void reset_events(); void destroy_events(); diff --git a/src/sched/entry/ze/ze_cache.cpp b/src/sched/entry/ze/ze_cache.cpp index a7d72e4f8..d4b31aebf 100644 --- a/src/sched/entry/ze/ze_cache.cpp +++ b/src/sched/entry/ze/ze_cache.cpp @@ -365,12 +365,13 @@ void module_cache::load(ze_context_handle_t context, load_module(file_path, device, context, module); } +// mem_handle cache mem_handle_cache::mem_handle_cache() { - if (!global_data::env().enable_ze_cache_ipc_handles) { + if (!global_data::env().enable_ze_cache_open_ipc_handles) { return; } - threshold = global_data::env().ze_cache_ipc_handles_threshold; + threshold = global_data::env().ze_cache_open_ipc_handles_threshold; cache.reserve(threshold + 1); LOG_DEBUG("cache threshold: ", threshold); } @@ -425,7 +426,6 @@ void mem_handle_cache::get(ze_context_handle_t context, if (fd_is_valid(fd)) { // move key_value to the beginning of the list cache_list.splice(cache_list.begin(), cache_list, key_value->second); - close_handle_fd(info.handle); *out_value = value; found = true; } @@ -438,13 +438,13 @@ void mem_handle_cache::get(ze_context_handle_t context, } if (!found) { - push(device, std::move(key), info.handle, out_value); + push(device, std::move(key), info, out_value); } } void mem_handle_cache::push(ze_device_handle_t device, key_t&& key, - const ze_ipc_mem_handle_t& handle, + const ipc_handle_desc& info, value_t* out_value) { make_clean(threshold); @@ -453,6 +453,8 @@ void mem_handle_cache::push(ze_device_handle_t device, auto remote_context = global_data::get().ze_data->contexts.at(remote_context_id); void* ptr{}; + ze_ipc_mem_handle_t handle = info.mem_to_ipc_handle(); + ZE_CALL(zeMemOpenIpcHandle, (remote_context, device, handle, {}, &ptr)); *out_value = std::make_shared(remote_context, handle, ptr); cache_list.push_front(std::make_pair(key, *out_value)); @@ -496,6 +498,53 @@ mem_handle_cache::handle_desc::~handle_desc() { 
close_handle(); } +// ipc_handle cache +void ipc_handle_cache::clear() { + LOG_DEBUG("clear ipc_handle_cache: size: ", cache.size()); + std::lock_guard lock(mutex); + for (auto& key_value : cache) { + close_handle_fd(key_value.second); + } + cache.clear(); +} + +ipc_handle_cache::~ipc_handle_cache() { + if (!cache.empty()) { + LOG_WARN("ipc_handle_cache is not empty, size: ", cache.size()); + clear(); + } +} +void ipc_handle_cache::get(ze_context_handle_t context, + ze_device_handle_t device, + const ipc_get_handle_desc& ipc_desc, + value_t* out_value) { + CCL_THROW_IF_NOT(context); + CCL_THROW_IF_NOT(device); + CCL_THROW_IF_NOT(ipc_desc.ptr); + + std::lock_guard lock(mutex); + + key_t key(ipc_desc.ptr, ipc_desc.mem_id); + + auto key_value = cache.find(key); + if (key_value != cache.end()) { + value_t& value = key_value->second; + *out_value = value; + } + else { + LOG_DEBUG("ipc_handle is not found in the cache"); + push(context, std::move(key), ipc_desc, out_value); + } +} + +void ipc_handle_cache::push(ze_context_handle_t context, + key_t&& key, + const ipc_get_handle_desc& ipc_desc, + value_t* out_value) { + ZE_CALL(zeMemGetIpcHandle, (context, ipc_desc.ptr, out_value)); + cache.insert({ std::move(key), *out_value }); +} + // cache cache::~cache() { for (size_t i = 0; i < instance_count; ++i) { @@ -508,6 +557,7 @@ cache::~cache() { modules.clear(); mem_handles.clear(); + ipc_handles.clear(); } } // namespace ze diff --git a/src/sched/entry/ze/ze_cache.hpp b/src/sched/entry/ze/ze_cache.hpp index 02051f2c9..a76f73e1f 100644 --- a/src/sched/entry/ze/ze_cache.hpp +++ b/src/sched/entry/ze/ze_cache.hpp @@ -231,12 +231,39 @@ class mem_handle_cache { void push(ze_device_handle_t device, key_t&& key, - const ze_ipc_mem_handle_t& handle, + const ipc_handle_desc& info, value_t* out_value); void make_clean(size_t limit); bool fd_is_valid(int fd); }; +struct ipc_get_handle_desc; + +class ipc_handle_cache { +public: + using value_t = ze_ipc_mem_handle_t; + + 
ipc_handle_cache() = default; + ~ipc_handle_cache(); + + void clear(); + + void get(ze_context_handle_t context, + ze_device_handle_t device, + const ipc_get_handle_desc& ipc_info, + value_t* out_value); + +private: + using key_t = typename std::tuple; + std::unordered_multimap cache; + std::mutex mutex; + + void push(ze_context_handle_t context, + key_t&& key, + const ipc_get_handle_desc& ipc_info, + value_t* out_value); +}; + class cache { public: cache(size_t instance_count) @@ -308,6 +335,13 @@ class cache { mem_handles.get(context, device, info, out_value); } + void get(ze_context_handle_t context, + ze_device_handle_t device, + const ipc_get_handle_desc& ipc_info, + ipc_handle_cache::value_t* out_value) { + ipc_handles.get(context, device, ipc_info, out_value); + } + /* push */ void push(size_t instance_idx, ze_module_handle_t module, @@ -359,6 +393,7 @@ class cache { std::vector device_mems; module_cache modules{}; mem_handle_cache mem_handles{}; + ipc_handle_cache ipc_handles{}; }; } // namespace ze diff --git a/src/sched/entry/ze/ze_call.hpp b/src/sched/entry/ze/ze_call.hpp index 8a9ebb010..e324fcfa6 100644 --- a/src/sched/entry/ze/ze_call.hpp +++ b/src/sched/entry/ze/ze_call.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include "common/ze/ze_api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" namespace ccl { namespace ze { diff --git a/src/sched/entry/ze/ze_copy_entry.cpp b/src/sched/entry/ze/ze_copy_entry.cpp index e17b79631..90552a0e0 100644 --- a/src/sched/entry/ze/ze_copy_entry.cpp +++ b/src/sched/entry/ze/ze_copy_entry.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "sched/entry/copy/copy_helper.hpp" #include "sched/entry/ze/ze_copy_entry.hpp" using namespace ccl; @@ -23,14 +24,16 @@ ze_copy_entry::ze_copy_entry(ccl_sched* sched, size_t count, const ccl_datatype& dtype, const copy_attr& attr, - std::vector wait_events) + std::vector wait_events, + std::vector dep_events) : ze_base_entry(sched, nullptr, 1, wait_events), sched(sched), in_buf(in_buf), out_buf(out_buf), dtype(dtype), attr(attr), - count(count) { + count(count), + dep_events(dep_events) { CCL_THROW_IF_NOT(sched, "no sched"); } @@ -51,8 +54,23 @@ void ze_copy_entry::init_ze_hook() { void* src = static_cast(in_buf.get_ptr()) + attr.in_buf_offset * dtype.size(); ze_command_list_handle_t list = - ze_base_entry::get_copy_list(attr.hint_queue_index, attr.is_peer_card_copy); - + ze_base_entry::get_copy_list(attr.direction, attr.hint_queue_index); ZE_CALL(zeCommandListAppendMemoryCopy, - (list, dst, src, dtype.size() * count, ze_base_entry::entry_event, 0, nullptr)); + (list, + dst, + src, + dtype.size() * count, + ze_base_entry::entry_event, + dep_events.size(), + dep_events.data())); +} + +std::string ze_copy_entry::name_ext() const { + std::stringstream out; + out << name(); + if (attr.direction != copy_direction::undefined) { + out << ":" << to_string(attr.direction); + } + out << ":" << count * dtype.size(); + return out.str(); } diff --git a/src/sched/entry/ze/ze_copy_entry.hpp b/src/sched/entry/ze/ze_copy_entry.hpp index 5e19bdc34..4099e6a68 100644 --- a/src/sched/entry/ze/ze_copy_entry.hpp +++ b/src/sched/entry/ze/ze_copy_entry.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include "sched/entry/copy/copy_helper.hpp" #include "sched/entry/ze/ze_base_entry.hpp" struct copy_attr; @@ -30,12 +29,7 @@ class ze_copy_entry : public ze_base_entry { return class_name(); } - virtual std::string name_ext() const override { - std::stringstream out; - out << name() << " "; - out << "size: " << count; - return out.str(); - } + virtual std::string name_ext() const override; 
explicit ze_copy_entry(ccl_sched* sched, ccl_buffer in_buf, @@ -43,7 +37,8 @@ class ze_copy_entry : public ze_base_entry { size_t count, const ccl_datatype& dtype, const copy_attr& attr = {}, - std::vector wait_events = {}); + std::vector wait_events = {}, + std::vector dep_events = {}); void init_ze_hook() override; @@ -54,4 +49,5 @@ class ze_copy_entry : public ze_base_entry { const ccl_datatype dtype; const copy_attr attr; const size_t count; + std::vector dep_events; }; diff --git a/src/sched/entry/ze/ze_event_signal_entry.cpp b/src/sched/entry/ze/ze_event_signal_entry.cpp index 1a5b860f2..7d881f423 100644 --- a/src/sched/entry/ze/ze_event_signal_entry.cpp +++ b/src/sched/entry/ze/ze_event_signal_entry.cpp @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "common/utils/sycl_utils.hpp" #include "sched/entry/ze/ze_event_signal_entry.hpp" #include "sched/queue/queue.hpp" -#include "common/utils/sycl_utils.hpp" +#include "sched/sched.hpp" ze_event_signal_entry::ze_event_signal_entry(ccl_sched* sched, ccl_sched* master_sched) : sched_entry(sched), diff --git a/src/sched/entry/ze/ze_event_signal_entry.hpp b/src/sched/entry/ze/ze_event_signal_entry.hpp index 435cb8f88..725eb8085 100644 --- a/src/sched/entry/ze/ze_event_signal_entry.hpp +++ b/src/sched/entry/ze/ze_event_signal_entry.hpp @@ -16,7 +16,6 @@ #pragma once #include "sched/entry/entry.hpp" -#include "sched/sched.hpp" class ccl_sched; diff --git a/src/sched/entry/ze/ze_handle_exchange_entry.cpp b/src/sched/entry/ze/ze_handle_exchange_entry.cpp index f08005926..fcd302bb3 100644 --- a/src/sched/entry/ze/ze_handle_exchange_entry.cpp +++ b/src/sched/entry/ze/ze_handle_exchange_entry.cpp @@ -19,14 +19,9 @@ #include "sched/entry/ze/ze_primitives.hpp" #include "sched/ze/ze_handle_manager.hpp" -#include #include #include -#include #include -#include -#include -#include static void cast_pool_to_mem_handle(ze_ipc_mem_handle_t* mem, const 
ze_ipc_event_pool_handle_t* pool) { @@ -49,11 +44,54 @@ ze_handle_exchange_entry::ze_handle_exchange_entry(ccl_sched* sched, CCL_THROW_IF_NOT(sched, "no sched"); CCL_THROW_IF_NOT(!in_buffers.empty(), "in_buffers should be non empty"); + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::sockets) { + if (comm_size == 1) { + this->skip_rank = rank; + } + poll_fds.reserve(max_pfds); + std::string unique_tag = + std::to_string(comm->get_comm_id()) + "-" + std::to_string(sched->sched_id) + "-" + + std::to_string(sched->get_op_id()) + "-" + std::to_string(getuid()) + "-" + + comm->get_topo_manager().get_uuid(0); + right_peer_socket_name = + "/tmp/ccl-handle-" + std::to_string((rank + 1) % comm_size) + "-" + unique_tag; + left_peer_socket_name = "/tmp/ccl-handle-" + std::to_string(rank) + "-" + unique_tag; + } + create_local_ipc_handles(in_buffers); + LOG_DEBUG("init completed"); +} + +ze_handle_exchange_entry::~ze_handle_exchange_entry() { + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::sockets) { + close_sockets(); + unlink_sockets(); + } + + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) { + for (auto& fd : opened_pidfds) { + close(fd); + } + opened_pidfds.clear(); + } +} + +void ze_handle_exchange_entry::start() { + start_buf_idx = start_peer_idx = 0; + skip_first_send = false; + status = ccl_sched_entry_status_started; + if (comm_size == 1) { - skip_rank = rank; + status = ccl_sched_entry_status_complete; } +} - poll_fds.reserve(max_pfds); +void ze_handle_exchange_entry::create_local_ipc_handles(const std::vector& bufs) { + if (comm_size == 1) { + this->skip_rank = rank; + } + + // current pid of current rank + current_pid = getpid(); handles.resize(comm_size); for (auto& buffers : handles) { @@ -62,13 +100,14 @@ ze_handle_exchange_entry::ze_handle_exchange_entry(ccl_sched* sched, LOG_DEBUG("handles size: ", handles.size(), ", in_buffers size: ", in_buffers.size()); for (size_t 
buf_idx = 0; buf_idx < in_buffers.size(); buf_idx++) { + int mem_handle = ccl::utils::invalid_mem_handle; auto mem_ptr = in_buffers[buf_idx].first; CCL_THROW_IF_NOT(mem_ptr, "memory pointer is nullptr"); auto mem_type = in_buffers[buf_idx].second; mem_info_t mem_info{}; - ze_ipc_mem_handle_t handle{}; - if (rank != skip_rank) { + ze_ipc_mem_handle_t ipc_handle{}; + if (rank != this->skip_rank) { if (mem_type == ccl::ze::ipc_mem_type::memory) { // zeMemGetIpcHandle requires the provided pointer to be the base of an allocation. // We handle this the following way: for an input buffer retrieve its base pointer @@ -76,7 +115,21 @@ ze_handle_exchange_entry::ze_handle_exchange_entry(ccl_sched* sched, // and the offset is sent to the other rank. On that rank the base ptr is retrieved // and offsetted to get the actual input buffer ptr. mem_info = get_mem_info(mem_ptr); - sched->get_memory().handle_manager.get_handle(mem_info.first, &handle); + sched->get_memory().handle_manager.get_handle(mem_info.first, &ipc_handle); + + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) { + device_fds = ccl::global_data::get().ze_data->fd_manager->get_device_fds(); + mem_handle = ipc_to_mem_handle( + ipc_handle, + ccl::ze::get_parent_device_id(sched->coll_param.stream->get_ze_device())); + } + else if (ccl::global_data::env().ze_ipc_exchange == + ccl::ze::ipc_exchange_mode::pidfd) { + mem_handle = ipc_to_mem_handle(ipc_handle); + } + else { + LOG_DEBUG("don't need ipc_to_mem_handle in sockets mode"); + } } else if (mem_type == ccl::ze::ipc_mem_type::pool) { ze_ipc_event_pool_handle_t pool_handle; @@ -85,54 +138,152 @@ ze_handle_exchange_entry::ze_handle_exchange_entry(ccl_sched* sched, // since ze_ipc_event_pool_handle_t and ze_ipc_mem_handle_t are similar, // we cast ze_ipc_event_pool_handle_t to ze_ipc_mem_handle_t, but // maybe this is not the most correct way - cast_pool_to_mem_handle(&handle, &pool_handle); + cast_pool_to_mem_handle(&ipc_handle, 
&pool_handle); } else { CCL_THROW("unknown memory type"); } } - handles[rank][buf_idx] = { handle, mem_info.second, mem_type }; + handles[rank][buf_idx] = { ipc_handle, mem_info.second, mem_type, mem_handle }; LOG_DEBUG("set IPC handle: { rank: ", rank, ", buf_idx: ", buf_idx, ", mem_type: ", to_string(mem_type), + ", mem_handle: ", + mem_handle, " }"); } +} - std::string unique_tag = std::to_string(comm->get_comm_id()) + "-" + - std::to_string(sched->sched_id) + "-" + - std::to_string(sched->get_op_id()) + "-" + std::to_string(getuid()) + - "-" + comm->get_topo_manager().get_uuid(0); - right_peer_socket_name = - "/tmp/ccl-handle-" + std::to_string((rank + 1) % comm_size) + "-" + unique_tag; - left_peer_socket_name = "/tmp/ccl-handle-" + std::to_string(rank) + "-" + unique_tag; +int ze_handle_exchange_entry::ipc_to_mem_handle(const ze_ipc_mem_handle_t& ipc_handle, + const int parent_dev_id) { + int dmabuf_fd; + int mem_handle = ccl::utils::invalid_mem_handle; - LOG_DEBUG("init completed"); + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) { + // convert dma_buf fd to GEM handle + memcpy(&dmabuf_fd, &ipc_handle, sizeof(dmabuf_fd)); + mem_handle = ccl::ze::fd_manager::fd_to_mem_handle(device_fds[parent_dev_id], dmabuf_fd); + LOG_DEBUG("device_fd: ", device_fds[parent_dev_id]); + } + else if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) { + memcpy(&mem_handle, &ipc_handle, sizeof(int)); + } + else { + CCL_THROW("unexpected ipc_exchange_mode"); + } + + CCL_THROW_IF_NOT(mem_handle != ccl::utils::invalid_mem_handle, + "convertation failed: invalid mem_handle: ", + mem_handle); + LOG_DEBUG("mem_handle: ", mem_handle); + return mem_handle; } -ze_handle_exchange_entry::~ze_handle_exchange_entry() { - close_sockets(); - unlink_sockets(); +void ze_handle_exchange_entry::fill_payload(payload_t& payload, + const std::vector& bufs, + size_t buf_idx) { + payload.mem_handle = handles[rank][buf_idx].mem_handle; + 
payload.mem_type = handles[rank][buf_idx].mem_type; + payload.mem_offset = handles[rank][buf_idx].mem_offset; + payload.remote_pid = getpid(); + const void* ptr = bufs[buf_idx].first; + + ze_context_handle_t remote_context{}; + ze_device_handle_t remote_device{}; + ze_memory_allocation_properties_t mem_alloc_props; + if (!ccl::ze::get_buffer_context_and_device( + ptr, &remote_context, &remote_device, &mem_alloc_props)) { + CCL_THROW("unable to get context from ptr\n"); + } + ssize_t remote_context_id{ ccl::utils::invalid_context_id }; + if (!ccl::ze::get_context_global_id(remote_context, &remote_context_id)) { + CCL_THROW("unable to get global id for context\n"); + } + ssize_t remote_device_id{ ccl::utils::invalid_device_id }; + if (!ccl::ze::get_device_global_id(remote_device, &remote_device_id)) { + CCL_THROW("unable to get global id for device\n"); + } + + payload.remote_mem_alloc_id = mem_alloc_props.id; + payload.remote_context_id = remote_context_id; + payload.remote_device_id = remote_device_id; } -void ze_handle_exchange_entry::start() { - start_buf_idx = start_peer_idx = 0; - skip_first_send = false; - status = ccl_sched_entry_status_started; +void ze_handle_exchange_entry::fill_remote_handle(const payload_t& payload, + ze_ipc_mem_handle_t ipc_handle, + const size_t idx, + const size_t buf_idx) { + handles[idx][buf_idx] = { + ipc_handle, payload.mem_offset, payload.mem_type, payload.mem_handle + }; + handles[idx][buf_idx].remote_mem_alloc_id = payload.remote_mem_alloc_id; + handles[idx][buf_idx].remote_context_id = payload.remote_context_id; + handles[idx][buf_idx].remote_pid = payload.remote_pid; + handles[idx][buf_idx].remote_device_id = payload.remote_device_id; + handles[idx][buf_idx].pidfd_fd = payload.pidfd_fd; + handles[idx][buf_idx].device_fd = payload.device_fd; + LOG_DEBUG("get IPC handle: { peer: ", + idx, + ", buf_idx: ", + buf_idx, + ", mem_type: ", + to_string(payload.mem_type), + " }"); +} - if (comm_size == 1) { - status = 
ccl_sched_entry_status_complete; +void ze_handle_exchange_entry::common_fd_mode_exchange(const std::vector& bufs) { + for (size_t buf_idx = 0; buf_idx < bufs.size(); buf_idx++) { + std::vector payloads(comm_size); + payload_t payload{}; + fill_payload(payload, bufs, buf_idx); + + if (!(ccl::utils::allgather( + comm->get_atl_comm(), &payload, payloads.data(), sizeof(payload_t)))) { + CCL_THROW("allgather exchange is failed"); + } + + for (size_t idx = 0; idx < payloads.size(); idx++) { + if (comm->rank() == (int)idx) { + continue; + } + + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) { + int remote_device_id = payloads[idx].remote_device_id; + payloads[idx].device_fd = device_fds + [ccl::global_data::get().ze_data->devices[remote_device_id].parent_idx]; + } + else if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) { + opened_pidfds.push_back(ccl::ze::fd_manager::pidfd_open(payloads[idx].remote_pid)); + payloads[idx].pidfd_fd = opened_pidfds.back(); + } + else { + CCL_THROW("unexpected ipc_exchange_mode"); + } + fill_remote_handle( + payloads[idx], + {}, // ipc_handle is empty, it's initialized immeadiately before calling zeMemOpenIpcHandle + idx, + buf_idx); + } } + + LOG_DEBUG(ccl::ze::ipc_exchange_names[ccl::global_data::env().ze_ipc_exchange], + " mode completed, handles size: ", + handles.size(), + ", in_buffers size: ", + bufs.size()); + sched->get_memory().handle_manager.set(handles); } -void ze_handle_exchange_entry::update() { +int ze_handle_exchange_entry::sockets_mode_exchange(const std::vector& bufs) { if (!is_created) { // server - left_peer_connect_socket = create_server_socket( - left_peer_socket_name, &left_peer_addr, &left_peer_addr_len, comm_size); + left_peer_connect_socket = + create_server_socket(left_peer_socket_name, &left_peer_addr, &left_peer_addr_len); // client right_peer_socket = @@ -144,7 +295,7 @@ void ze_handle_exchange_entry::update() { if (!is_connected) { if 
(connect_call( right_peer_socket, &right_peer_addr, right_peer_addr_len, right_peer_socket_name)) { - return; + return ccl::utils::invalid_err_code; } is_connected = true; } @@ -155,7 +306,7 @@ void ze_handle_exchange_entry::update() { &left_peer_addr_len, left_peer_socket_name, left_peer_socket)) { - return; + return ccl::utils::invalid_err_code; } struct pollfd poll_fd {}; @@ -173,44 +324,23 @@ void ze_handle_exchange_entry::update() { for (int peer_idx = start_peer_idx; peer_idx < comm_size - 1; peer_idx++) { int peer = (comm_size + rank - 1 - peer_idx) % comm_size; - if ((peer_idx == 0) && !skip_first_send && (rank != skip_rank)) { + if ((peer_idx == 0) && !skip_first_send && (rank != this->skip_rank)) { // send own handle to right peer - int send_fd = ccl::ze::get_fd_from_handle(handles[rank][buf_idx].handle); + int send_fd = ccl::ze::get_fd_from_handle(handles[rank][buf_idx].ipc_handle); payload_t payload{}; - payload.mem_offset = handles[rank][buf_idx].mem_offset; - payload.remote_pid = getpid(); - void* ptr = in_buffers[buf_idx].first; - - ze_context_handle_t remote_context{}; - ze_device_handle_t remote_device{}; - ze_memory_allocation_properties_t mem_alloc_props; - if (!ccl::ze::get_buffer_context_and_device( - ptr, &remote_context, &remote_device, &mem_alloc_props)) { - CCL_THROW("unable to get context from ptr\n"); - } - ssize_t remote_context_id{ -1 }; - if (!ccl::ze::get_context_global_id(remote_context, &remote_context_id)) { - CCL_THROW("unable to get global id for context\n"); - } - ssize_t remote_device_id{ -1 }; - if (!ccl::ze::get_device_global_id(remote_device, &remote_device_id)) { - CCL_THROW("unable to get global id for device\n"); - } - - payload.remote_mem_alloc_id = mem_alloc_props.id; - payload.remote_context_id = remote_context_id; - payload.remote_device_id = remote_device_id; - sendmsg_call(right_peer_socket, send_fd, payload); + fill_payload(payload, in_buffers, buf_idx); + ccl::utils::sendmsg_call( + right_peer_socket, send_fd, 
&payload, sizeof(payload_t), rank); skip_first_send = true; } - if (peer == skip_rank) + if (peer == this->skip_rank) continue; int poll_ret = poll(&poll_fds[0], poll_fds.size(), timeout_ms); if (poll_ret == poll_expire_err_code) { - return; + return ccl::utils::invalid_err_code; } else if (poll_ret == POLL_ERR) { CCL_THROW("poll: error: ", strerror(errno), ", ret: ", poll_ret); @@ -219,34 +349,24 @@ void ze_handle_exchange_entry::update() { CCL_THROW_IF_NOT(poll_ret > 0, "unexpected poll ret: ", poll_ret); if (poll_fds[0].revents & POLLIN) { - int recv_fd = -1; + int recv_fd = ccl::utils::invalid_fd; // recv data from left peer payload_t payload{}; - recvmsg_call(left_peer_socket, recv_fd, payload); + ccl::utils::recvmsg_call( + left_peer_socket, &recv_fd, &payload, sizeof(payload_t), rank); ze_ipc_mem_handle_t tmp_handle = ccl::ze::get_handle_from_fd(recv_fd); // we don't know anything about the memory type on the other side, // so we take it from our list. This assumes that the lists of types (exactly types) // on the sending and receiving side are the same in both value and quantity - auto mem_type = in_buffers[buf_idx].second; - handles[peer][buf_idx] = { tmp_handle, payload.mem_offset, mem_type }; - handles[peer][buf_idx].remote_mem_alloc_id = payload.remote_mem_alloc_id; - handles[peer][buf_idx].remote_context_id = payload.remote_context_id; - handles[peer][buf_idx].remote_pid = payload.remote_pid; - handles[peer][buf_idx].remote_device_id = payload.remote_device_id; - LOG_DEBUG("get IPC handle: { peer: ", - peer, - ", buf_idx: ", - buf_idx, - ", mem_type: ", - to_string(mem_type), - " }"); + fill_remote_handle(payload, tmp_handle, peer, buf_idx); if (peer_idx < (comm_size - 2)) { // proxy data to right peer - sendmsg_call(right_peer_socket, recv_fd, payload); + ccl::utils::sendmsg_call( + right_peer_socket, recv_fd, &payload, sizeof(payload), rank); } start_peer_idx++; } @@ -260,7 +380,7 @@ void ze_handle_exchange_entry::update() { LOG_TRACE("poll: nothing 
to receive, buf_idx: ", buf_idx, ", peer_idx ", peer_idx); // nothing to receive // continue with the same buf_idx/peer_idx in the next update() call - return; + return ccl::utils::invalid_err_code; } } start_peer_idx = 0; @@ -279,16 +399,30 @@ void ze_handle_exchange_entry::update() { // of waiting till destruction of the entry. close_sockets(); } + LOG_DEBUG("sockets_mode_exchange completed"); + return 0; +} +void ze_handle_exchange_entry::update() { + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::sockets) { + if (sockets_mode_exchange(in_buffers)) { + return; + } + } + else if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd || + ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) { + common_fd_mode_exchange(in_buffers); + } + else { + CCL_THROW("unexpected ipc_exchange_mode"); + } status = ccl_sched_entry_status_complete; - LOG_DEBUG("completed: ", name()); } int ze_handle_exchange_entry::create_server_socket(const std::string& socket_name, struct sockaddr_un* socket_addr, - int* addr_len, - int comm_size) { + int* addr_len) { int ret = 0; memset(&(*socket_addr), 0, sizeof((*socket_addr))); @@ -399,161 +533,6 @@ int ze_handle_exchange_entry::connect_call(int sock, return 0; } -int ze_handle_exchange_entry::check_msg_retval(std::string operation_name, - ssize_t bytes, - struct iovec iov, - struct msghdr msg, - size_t union_size, - int sock, - int fd) { - LOG_DEBUG(operation_name, - ": ", - bytes, - ", expected_bytes:", - iov.iov_len, - ", expected size of cntr_buf: ", - union_size, - " -> gotten cntr_buf: ", - msg.msg_controllen, - ", socket: ", - sock, - ", fd: ", - fd); - int ret = -1; - if (bytes == static_cast(iov.iov_len)) { - ret = 0; - } - else if (bytes < 0) { - ret = -errno; - } - else { - ret = -EIO; - } - return ret; -} - -void ze_handle_exchange_entry::sendmsg_fd(int sock, int fd, const payload_t& payload) { - CCL_THROW_IF_NOT(fd >= 0, "unexpected fd value"); - - 
struct iovec iov {}; - iov.iov_base = const_cast(&payload); - iov.iov_len = sizeof(payload); - - union { - struct cmsghdr align; - char cntr_buf[CMSG_SPACE(sizeof(int))]{}; - } u; - - struct msghdr msg {}; - msg.msg_control = u.cntr_buf; - msg.msg_controllen = sizeof(u.cntr_buf); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - auto cmsg = CMSG_FIRSTHDR(&msg); - cmsg->cmsg_len = CMSG_LEN(sizeof(fd)); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - *reinterpret_cast(CMSG_DATA(cmsg)) = fd; - - ssize_t send_bytes = sendmsg(sock, &msg, 0); - CCL_THROW_IF_NOT( - !check_msg_retval("sendmsg", send_bytes, iov, msg, sizeof(u.cntr_buf), sock, fd), - ", from: ", - comm->rank(), - ", errno: ", - strerror(errno)); -} - -void ze_handle_exchange_entry::recvmsg_fd(int sock, int& fd, payload_t& payload) { - payload_t buf{}; - struct iovec iov {}; - iov.iov_base = &buf; - iov.iov_len = sizeof(buf); - - union { - struct cmsghdr align; - char cntr_buf[CMSG_SPACE(sizeof(int))]{}; - } u; - - struct msghdr msg {}; - msg.msg_control = u.cntr_buf; - msg.msg_controllen = sizeof(u.cntr_buf); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - ssize_t recv_bytes = recvmsg(sock, &msg, 0); - CCL_THROW_IF_NOT( - !check_msg_retval("recvmsg", recv_bytes, iov, msg, sizeof(u.cntr_buf), sock, fd), - ", from: ", - comm->rank(), - ", errno: ", - strerror(errno)); - - if (msg.msg_flags & (MSG_CTRUNC | MSG_TRUNC)) { - std::string flag_str = ""; - if (msg.msg_flags & MSG_CTRUNC) { - flag_str += " MSG_CTRUNC"; - } - if (msg.msg_flags & MSG_TRUNC) { - flag_str += " MSG_TRUNC"; - } - - /** MSG_CTRUNC message can be in case of: - * - remote peer send invalid fd, so msg_controllen == 0 - * - limit of fds reached in the current process, so msg_controllen == 0 - * - the remote peer control message > msg_control buffer size - */ - CCL_THROW("control or usual message is truncated:", - flag_str, - " control message size: ", - msg.msg_controllen, - ", ", - to_string(ccl::utils::get_fd_info())); - } - - 
for (auto cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) { - if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)) && cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_RIGHTS) { - memcpy(&fd, CMSG_DATA(cmsg), sizeof(int)); - break; - } - } - - // we assume that the message has a strict format and size, if not this means that something - // is wrong. - if (msg.msg_iovlen != 1 || msg.msg_iov[0].iov_len != sizeof(payload_t)) { - CCL_THROW("received data in unexpected format"); - } - - memcpy(&payload, msg.msg_iov[0].iov_base, sizeof(payload_t)); -} - -void ze_handle_exchange_entry::sendmsg_call(int sock, int fd, const payload_t& payload) { - sendmsg_fd(sock, fd, payload); - LOG_DEBUG("send: rank[", - comm->rank(), - "], send fd: ", - fd, - ", sock: ", - sock, - ", mem_offset: ", - payload.mem_offset); -} - -void ze_handle_exchange_entry::recvmsg_call(int sock, int& fd, payload_t& payload) { - recvmsg_fd(sock, fd, payload); - - LOG_DEBUG("recv: rank[", - rank, - "], got fd: ", - fd, - ", sock: ", - sock, - ", mem_offset: ", - payload.mem_offset); -} - ze_handle_exchange_entry::mem_info_t ze_handle_exchange_entry::get_mem_info(const void* ptr) { void* base_ptr{}; size_t alloc_size{}; @@ -577,3 +556,18 @@ void ze_handle_exchange_entry::close_sockets() { sockets_closed = true; } } + +void ze_handle_exchange_entry::dump_detail(std::stringstream& str) const { + ccl_logger::format(str, + "comm ", + comm->to_string(), + ", right_peer ", + right_peer_socket_name, + ", left_peer ", + left_peer_socket_name, + ", in_buffers size ", + in_buffers.size(), + ", handles size ", + handles.size(), + "\n"); +} diff --git a/src/sched/entry/ze/ze_handle_exchange_entry.hpp b/src/sched/entry/ze/ze_handle_exchange_entry.hpp index 40eac82d6..5e0b0201c 100644 --- a/src/sched/entry/ze/ze_handle_exchange_entry.hpp +++ b/src/sched/entry/ze/ze_handle_exchange_entry.hpp @@ -16,13 +16,14 @@ #pragma once #include "comm/comm.hpp" +#include "common/utils/exchange_utils.hpp" 
+#include "common/utils/utils.hpp" #include "sched/entry/entry.hpp" #include "sched/sched.hpp" #include "sched/ze/ze_handle_manager.hpp" #include #include -#include "common/ze/ze_api_wrapper.hpp" class ze_handle_exchange_entry : public sched_entry { public: @@ -47,23 +48,9 @@ class ze_handle_exchange_entry : public sched_entry { void update() override; protected: - void dump_detail(std::stringstream& str) const override { - ccl_logger::format(str, - "comm ", - comm->to_string(), - ", right_peer ", - right_peer_socket_name, - ", left_peer ", - left_peer_socket_name, - ", in_buffers size ", - in_buffers.size(), - ", handles size ", - handles.size(), - "\n"); - } + void dump_detail(std::stringstream& str) const override; private: - static constexpr size_t socket_max_str_len = 100; static constexpr int poll_expire_err_code = 0; static constexpr int timeout_ms = 1; static constexpr size_t max_pfds = 1; @@ -75,7 +62,8 @@ class ze_handle_exchange_entry : public sched_entry { const int rank; const int comm_size; - const int skip_rank; + int skip_rank; + pid_t current_pid = ccl::utils::invalid_pid; int start_buf_idx{}; int start_peer_idx{}; @@ -93,14 +81,33 @@ class ze_handle_exchange_entry : public sched_entry { std::string right_peer_socket_name; std::string left_peer_socket_name; + std::vector device_fds; + struct payload_t { - pid_t remote_pid{}; + int mem_handle{ ccl::utils::invalid_mem_handle }; + ccl::ze::ipc_mem_type mem_type{}; + pid_t remote_pid{ ccl::utils::invalid_pid }; size_t mem_offset{}; uint64_t remote_mem_alloc_id{}; - ssize_t remote_context_id{ -1 }; - ssize_t remote_device_id{ -1 }; + ssize_t remote_context_id{ ccl::utils::invalid_context_id }; + ssize_t remote_device_id{ ccl::utils::invalid_device_id }; + int pidfd_fd{ ccl::utils::invalid_fd }; + int device_fd{ ccl::utils::invalid_fd }; }; + void fill_payload(payload_t& payload, const std::vector& bufs, size_t buf_idx); + void fill_remote_handle(const payload_t& payload, + ze_ipc_mem_handle_t 
ipc_handle, + const size_t idx, + const size_t buf_idx); + + int ipc_to_mem_handle(const ze_ipc_mem_handle_t& ipc_handle, + const int parent_dev_id = ccl::utils::invalid_device_id); + + void create_local_ipc_handles(const std::vector& bufs); + int sockets_mode_exchange(const std::vector& bufs); + void common_fd_mode_exchange(const std::vector& bufs); + bool is_created{}; bool is_connected{}; bool is_accepted{}; @@ -108,8 +115,7 @@ class ze_handle_exchange_entry : public sched_entry { int create_server_socket(const std::string& socket_name, struct sockaddr_un* socket_addr, - int* addr_len, - int comm_size); + int* addr_len); int create_client_socket(const std::string& left_peer_socket_name, struct sockaddr_un* sockaddr_cli, int* len); @@ -124,23 +130,11 @@ class ze_handle_exchange_entry : public sched_entry { int addr_len, const std::string& socket_name); - void sendmsg_fd(int sock, int fd, const payload_t& handle_desc); - void recvmsg_fd(int sock, int& fd, payload_t& handle_desc); - - void sendmsg_call(int sock, int fd, const payload_t& handle_desc); - void recvmsg_call(int sock, int& fd, payload_t& handle_desc); - int check_msg_retval(std::string operation_name, - ssize_t bytes, - struct iovec iov, - struct msghdr msg, - size_t union_size, - int sock, - int fd); - using mem_info_t = typename std::pair; mem_info_t get_mem_info(const void* ptr); bool sockets_closed = false; + std::vector opened_pidfds; void unlink_sockets(); void close_sockets(); diff --git a/src/sched/entry/ze/ze_onesided_reduce_entry.cpp b/src/sched/entry/ze/ze_onesided_reduce_entry.cpp index 41afaac03..5de49cf4e 100644 --- a/src/sched/entry/ze/ze_onesided_reduce_entry.cpp +++ b/src/sched/entry/ze/ze_onesided_reduce_entry.cpp @@ -32,7 +32,8 @@ ze_onesided_reduce_entry::ze_onesided_reduce_entry(ccl_sched* sched, reduction op, int root, ccl_comm* comm, - std::vector wait_events) + std::vector wait_events, + size_t peer_buf_offset) : ze_base_entry(sched, comm, 2 /* request additional events */, 
wait_events), send_buf(send_buf), recv_buf(recv_buf), @@ -41,6 +42,7 @@ ze_onesided_reduce_entry::ze_onesided_reduce_entry(ccl_sched* sched, op(op), root(root), buf_size_bytes(dtype.size() * cnt), + peer_buf_offset_bytes(dtype.size() * peer_buf_offset), empty_kernel_event(nullptr), empty_kernel(nullptr), empty_kernel_name("empty_kernel") { @@ -76,9 +78,12 @@ void ze_onesided_reduce_entry::init_ze_hook() { LOG_DEBUG( "get IPC pointers from ", peer_rank, " by ", root, ", right_send_buf: ", right_send_buf); + send_buf_ptr = send_buf.get_ptr(); + recv_buf_ptr = recv_buf.get_ptr(); + // TODO: in place case check! diff idx for handle_mngr - right_send_buf_ptr = right_send_buf.get_ptr(); + right_send_buf_ptr = static_cast(right_send_buf.get_ptr()) + peer_buf_offset_bytes; void* kernel_input_buf2 = right_send_buf_ptr; if (global_data::env().enable_kernel_1s_copy_ops) { @@ -163,6 +168,9 @@ void ze_onesided_reduce_entry::init_ze_hook() { } void ze_onesided_reduce_entry::finalize_ze_hook() { + if (comm->size() == 1) { + return; + } if (empty_kernel_event) { ccl::global_data::get().ze_data->cache->push( worker_idx, module, empty_kernel_name, empty_kernel); @@ -199,3 +207,28 @@ void ze_onesided_reduce_entry::update() { ccl::global_data::get().ze_data->kernel_counter--; } } + +std::string ze_onesided_reduce_entry::name_ext() const { + std::stringstream out; + out << name() << ":" << cnt * dtype.size(); + return out.str(); +} + +void ze_onesided_reduce_entry::dump_detail(std::stringstream& str) const { + ccl_logger::format(str, + "dt ", + ccl::global_data::get().dtypes->name(dtype), + ", cnt ", + cnt, + ", send_buf ", + send_buf, + ", recv_buf ", + recv_buf, + ", op ", + ccl_reduction_to_str(op), + ", comm ", + comm->to_string(), + ", context ", + context, + "\n"); +} diff --git a/src/sched/entry/ze/ze_onesided_reduce_entry.hpp b/src/sched/entry/ze/ze_onesided_reduce_entry.hpp index 05d52e2f2..088064519 100644 --- a/src/sched/entry/ze/ze_onesided_reduce_entry.hpp +++ 
b/src/sched/entry/ze/ze_onesided_reduce_entry.hpp @@ -32,12 +32,7 @@ class ze_onesided_reduce_entry : public ze_base_entry { return class_name(); } - virtual std::string name_ext() const override { - std::stringstream out; - out << name() << " "; - out << "size: " << cnt; - return out.str(); - } + virtual std::string name_ext() const override; ze_onesided_reduce_entry() = delete; explicit ze_onesided_reduce_entry(ccl_sched* sched, @@ -48,7 +43,8 @@ class ze_onesided_reduce_entry : public ze_base_entry { ccl::reduction op, int root, ccl_comm* comm, - std::vector wait_events = {}); + std::vector wait_events = {}, + size_t peer_buf_offset = 0); void init_ze_hook() override; void finalize_ze_hook() override; @@ -57,24 +53,7 @@ class ze_onesided_reduce_entry : public ze_base_entry { void update() override; protected: - void dump_detail(std::stringstream& str) const override { - ccl_logger::format(str, - "dt ", - ccl::global_data::get().dtypes->name(dtype), - ", cnt ", - cnt, - ", send_buf ", - send_buf, - ", recv_buf ", - recv_buf, - ", op ", - ccl_reduction_to_str(op), - ", comm ", - comm->to_string(), - ", context ", - context, - "\n"); - } + void dump_detail(std::stringstream& str) const override; private: ccl_buffer send_buf; @@ -87,6 +66,7 @@ class ze_onesided_reduce_entry : public ze_base_entry { const ccl::reduction op; int root; const size_t buf_size_bytes; + const size_t peer_buf_offset_bytes; ze_event_handle_t empty_kernel_event; ze_event_handle_t copy_from_peer_event; diff --git a/src/sched/entry/ze/ze_primitives.cpp b/src/sched/entry/ze/ze_primitives.cpp index b3b4d41c0..89370ce5b 100644 --- a/src/sched/entry/ze/ze_primitives.cpp +++ b/src/sched/entry/ze/ze_primitives.cpp @@ -18,12 +18,26 @@ #include "common/global/global.hpp" #include "common/log/log.hpp" +#include "common/utils/utils.hpp" #include "sched/entry/ze/ze_primitives.hpp" namespace ccl { namespace ze { +std::map copy_engine_names = { + std::make_pair(copy_engine_mode::none, "none"), + 
std::make_pair(copy_engine_mode::main, "main"), + std::make_pair(copy_engine_mode::link, "link"), + std::make_pair(copy_engine_mode::auto_mode, "auto") +}; + +std::map h2d_copy_engine_names = { + std::make_pair(h2d_copy_engine_mode::none, "none"), + std::make_pair(h2d_copy_engine_mode::main, "main"), + std::make_pair(h2d_copy_engine_mode::auto_mode, "auto") +}; + std::string get_build_log_string(ze_module_build_log_handle_t build_log) { size_t log_size{}; ZE_CALL(zeModuleBuildLogGetString, (build_log, &log_size, nullptr)); @@ -93,14 +107,19 @@ void get_suggested_group_size(ze_kernel_handle_t kernel, return; } - ZE_CALL(zeKernelSuggestGroupSize, - (kernel, - elem_count, - 1, - 1, - &group_size->groupSizeX, - &group_size->groupSizeY, - &group_size->groupSizeZ)); + if (ccl::global_data::env().kernel_group_size == 0) { + ZE_CALL(zeKernelSuggestGroupSize, + (kernel, + elem_count, + 1, + 1, + &group_size->groupSizeX, + &group_size->groupSizeY, + &group_size->groupSizeZ)); + } + else { + group_size->groupSizeX = ccl::global_data::env().kernel_group_size; + } CCL_THROW_IF_NOT(group_size->groupSizeX >= 1, "wrong group size calculation: size: ", @@ -117,6 +136,13 @@ void get_suggested_group_count(const ze_group_size_t& group_size, group_count->groupCountZ = 1; auto rem = elem_count % group_size.groupSizeX; + + //check whether any remaining elements are left and + //add an additional group to account for that + if (ccl::global_data::env().kernel_group_size != 0 && rem != 0) { + group_count->groupCountX = group_count->groupCountX + 1; + rem = 0; + } CCL_THROW_IF_NOT(group_count->groupCountX >= 1 && rem == 0, "wrong group calculation: size: ", to_string(group_size), @@ -129,16 +155,20 @@ void get_suggested_group_count(const ze_group_size_t& group_size, void set_kernel_args(ze_kernel_handle_t kernel, const ze_kernel_args_t& kernel_args) { uint32_t idx = 0; for (const auto& arg : kernel_args) { - auto res = zeKernelSetArgumentValue(kernel, idx, arg.size, arg.ptr); - if (res != 
ZE_RESULT_SUCCESS) { - CCL_THROW("zeKernelSetArgumentValue failed with error ", - to_string(res), - " on idx ", - idx, - " with value ", - *((void**)arg.ptr)); + // each arg can be an array with arg.count elements + for (size_t i = 0; i < arg.count; i++) { + auto ptr = &((char*)arg.ptr)[i * arg.size]; + auto res = zeKernelSetArgumentValue(kernel, idx, arg.size, ptr); + if (res != ZE_RESULT_SUCCESS) { + CCL_THROW("zeKernelSetArgumentValue failed with error ", + to_string(res), + " on idx ", + idx, + " with value ", + *((void**)ptr)); + } + ++idx; } - ++idx; } } @@ -249,8 +279,11 @@ bool get_device_global_id(ze_device_handle_t device, ssize_t* id) { CCL_THROW_IF_NOT(device, "no device"); CCL_THROW_IF_NOT(id, "no id"); bool success{}; - const auto& devices = global_data::get().ze_data->device_handles; - auto found = std::find(devices.begin(), devices.end(), device); + const auto& devices = global_data::get().ze_data->devices; + auto found = + std::find_if(devices.begin(), devices.end(), [&device](const device_info& info) -> bool { + return info.device == device; + }); if (found != devices.end()) { *id = std::distance(devices.begin(), found); success = true; @@ -258,11 +291,19 @@ bool get_device_global_id(ze_device_handle_t device, ssize_t* id) { return success; } -device_family get_device_family(ze_device_handle_t device) { - ze_device_properties_t device_prop = ccl::ze::default_device_props; +uint32_t get_parent_device_id(ze_device_handle_t device) { + ssize_t dev_id = ccl::utils::invalid_device_id; + ccl::ze::get_device_global_id(device, &dev_id); + CCL_THROW_IF_NOT(dev_id != ccl::utils::invalid_device_id, "unexpected dev_id"); + LOG_DEBUG("device_id: ", dev_id); + + return ccl::global_data::get().ze_data->devices[dev_id].parent_idx; +} - ZE_CALL(zeDeviceGetProperties, (device, &device_prop)); - uint32_t id = device_prop.deviceId & 0xfff0; +device_family get_device_family(ze_device_handle_t device) { + ze_device_properties_t dev_props = 
ccl::ze::default_device_props; + ZE_CALL(zeDeviceGetProperties, (device, &dev_props)); + uint32_t id = dev_props.deviceId & 0xfff0; using enum_t = typename std::underlying_type::type; switch (id) { @@ -314,6 +355,51 @@ bool is_same_fabric_port(const zes_fabric_port_id_t& port1, const zes_fabric_por return result; } +bool pci_address_comparator::operator()(const zes_pci_address_t& a, + const zes_pci_address_t& b) const { + if (a.domain == b.domain) { + if (a.bus == b.bus) { + if (a.device == b.device) { + if (a.function == b.function) { + return false; + } + else { + return (a.function < b.function); + } + } + else { + return (a.device < b.device); + } + } + else { + return (a.bus < b.bus); + } + } + else { + return (a.domain < b.domain); + } +} + +bool fabric_port_comparator::operator()(const zes_fabric_port_id_t& a, + const zes_fabric_port_id_t& b) const { + if (a.fabricId == b.fabricId) { + if (a.attachId == b.attachId) { + if (a.portNumber == b.portNumber) { + return false; + } + else { + return (a.portNumber < b.portNumber); + } + } + else { + return (a.attachId < b.attachId); + } + } + else { + return (a.fabricId < b.fabricId); + } +} + std::string to_string(ze_result_t result) { switch (result) { case ZE_RESULT_SUCCESS: return "ZE_RESULT_SUCCESS"; diff --git a/src/sched/entry/ze/ze_primitives.hpp b/src/sched/entry/ze/ze_primitives.hpp index 7b387d191..774e4eb54 100644 --- a/src/sched/entry/ze/ze_primitives.hpp +++ b/src/sched/entry/ze/ze_primitives.hpp @@ -19,7 +19,7 @@ #include #include -#include "common/ze/ze_api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" #include "sched/entry/ze/ze_call.hpp" namespace ccl { @@ -30,6 +30,12 @@ namespace ze { enum class device_id : uint32_t { unknown = 0x0, id1 = 0x200, id2 = 0xbd0 }; +enum class copy_engine_mode { none, main, link, auto_mode }; +enum class h2d_copy_engine_mode { none, main, auto_mode }; + +extern std::map copy_engine_names; +extern std::map h2d_copy_engine_names; + constexpr 
ze_context_desc_t default_context_desc = { .stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC, .pNext = nullptr, .flags = 0 }; @@ -110,12 +116,23 @@ void get_suggested_group_count(const ze_group_size_t& group_size, size_t elem_count, ze_group_count_t* group_count); +// use a maximum peer size of 5 since for 6 PVCs in Aurora. +// TODO: Need to generalize the peer count for other configs. +constexpr size_t max_peer_count = 5; + struct ze_kernel_arg_t { template constexpr ze_kernel_arg_t(const T* arg) noexcept : size{ sizeof(T) }, ptr{ static_cast(arg) } {} + template + constexpr ze_kernel_arg_t(const T* arg, const size_t count) noexcept + : size{ sizeof(T) }, + count{ count }, + ptr{ static_cast(arg) } {} const size_t size; + //TODO: should we use a vector of ptr instead of keeping ptr and count + const size_t count = 1; const void* ptr; }; @@ -137,6 +154,7 @@ bool get_buffer_context_and_device(const void* buf, ze_memory_allocation_properties_t* props = nullptr); bool get_context_global_id(ze_context_handle_t context, ssize_t* id); bool get_device_global_id(ze_device_handle_t device, ssize_t* id); +uint32_t get_parent_device_id(ze_device_handle_t device); int get_fd_from_handle(const ze_ipc_mem_handle_t& handle); void close_handle_fd(const ze_ipc_mem_handle_t& handle); @@ -149,50 +167,11 @@ bool is_same_dev_uuid(const ze_device_uuid_t& uuid1, const ze_device_uuid_t& uui bool is_same_fabric_port(const zes_fabric_port_id_t& port1, const zes_fabric_port_id_t& port2); struct pci_address_comparator { - bool operator()(const zes_pci_address_t& a, const zes_pci_address_t& b) const { - if (a.domain == b.domain) { - if (a.bus == b.bus) { - if (a.device == b.device) { - if (a.function == b.function) { - return false; - } - else { - return (a.function < b.function); - } - } - else { - return (a.device < b.device); - } - } - else { - return (a.bus < b.bus); - } - } - else { - return (a.domain < b.domain); - } - } + bool operator()(const zes_pci_address_t& a, const zes_pci_address_t& b) 
const; }; struct fabric_port_comparator { - bool operator()(const zes_fabric_port_id_t& a, const zes_fabric_port_id_t& b) const { - if (a.fabricId == b.fabricId) { - if (a.attachId == b.attachId) { - if (a.portNumber == b.portNumber) { - return false; - } - else { - return (a.portNumber < b.portNumber); - } - } - else { - return (a.attachId < b.attachId); - } - } - else { - return (a.fabricId < b.fabricId); - } - } + bool operator()(const zes_fabric_port_id_t& a, const zes_fabric_port_id_t& b) const; }; std::string to_string(ze_result_t result); @@ -217,6 +196,7 @@ template std::string flags_to_string(uint32_t flags) { constexpr size_t bits = 8; std::vector output; + for (size_t i = 0; i < sizeof(flags) * bits; ++i) { const size_t mask = 1UL << i; const auto flag = flags & mask; @@ -224,6 +204,11 @@ std::string flags_to_string(uint32_t flags) { output.emplace_back(to_string(static_cast(flag))); } } + + if (output.empty()) { + output.emplace_back(""); + } + return join_strings(output, " | "); } diff --git a/src/sched/entry/ze/ze_reduce_local_entry.cpp b/src/sched/entry/ze/ze_reduce_local_entry.cpp index 346b1f7fd..1ea95fbcb 100644 --- a/src/sched/entry/ze/ze_reduce_local_entry.cpp +++ b/src/sched/entry/ze/ze_reduce_local_entry.cpp @@ -74,3 +74,9 @@ void ze_reduce_local_entry::init_ze_hook() { void ze_reduce_local_entry::finalize_ze_hook() { global_data::get().ze_data->cache->push(worker_idx, module, kernel_name, kernel); } + +std::string ze_reduce_local_entry::name_ext() const { + std::stringstream out; + out << name() << ":" << in_cnt * dtype.size(); + return out.str(); +} diff --git a/src/sched/entry/ze/ze_reduce_local_entry.hpp b/src/sched/entry/ze/ze_reduce_local_entry.hpp index a574dfb12..c6bc0b28c 100644 --- a/src/sched/entry/ze/ze_reduce_local_entry.hpp +++ b/src/sched/entry/ze/ze_reduce_local_entry.hpp @@ -28,12 +28,7 @@ class ze_reduce_local_entry : public ze_base_entry { return class_name(); } - virtual std::string name_ext() const override { - 
std::stringstream out; - out << name() << " "; - out << "in size: " << in_cnt; - return out.str(); - } + virtual std::string name_ext() const override; explicit ze_reduce_local_entry(ccl_sched* sched, const ccl_buffer in_buf, diff --git a/src/sched/queue/queue.cpp b/src/sched/queue/queue.cpp index 44b196a71..b8d01d4ac 100644 --- a/src/sched/queue/queue.cpp +++ b/src/sched/queue/queue.cpp @@ -151,7 +151,7 @@ void ccl_sched_queue::add(ccl_sched* sched) { CCL_ASSERT(bin); } -size_t ccl_sched_queue::erase(ccl_sched_bin* bin, size_t idx) { +size_t ccl_sched_queue::erase(ccl_sched_bin* bin, size_t sched_idx) { CCL_ASSERT(bin); size_t bin_priority = bin->get_priority(); @@ -161,7 +161,7 @@ size_t ccl_sched_queue::erase(ccl_sched_bin* bin, size_t idx) { // erase sched and check bin size after // no need to lock whole `bins` for single erase - if (!bin->erase(idx, next_idx)) { + if (!bin->erase(sched_idx, next_idx)) { // 'bin 'looks like empty, we can erase it from 'bins'. // double check on bin.empty(), before remove it from whole table std::lock_guard lock{ bins_guard }; diff --git a/src/sched/queue/queue.hpp b/src/sched/queue/queue.hpp index a155ebfc4..b6f44f251 100644 --- a/src/sched/queue/queue.hpp +++ b/src/sched/queue/queue.hpp @@ -215,7 +215,7 @@ class ccl_sched_queue { size_t get_idx() const; void add(ccl_sched* sched); - size_t erase(ccl_sched_bin* bin, size_t idx); + size_t erase(ccl_sched_bin* bin, size_t sched_idx); void clear(); /** @@ -232,10 +232,11 @@ class ccl_sched_queue { std::lock_guard lock(bins_guard); out << "{\n"; out << " sched_queue: idx: " << idx << " size: " << bins.size() << "\n"; - size_t idx = 0; + size_t bin_idx = 0; for (auto& bin : bins) { - out << " bin: idx: " << idx << " priority: " << bin.first + out << " bin: idx: " << bin_idx << " priority: " << bin.first << " size: " << bin.second.size() << "\n"; + bin_idx++; bin.second.dump(out); } out << "}\n"; diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp index fae69b550..73772c403 
100644 --- a/src/sched/sched.cpp +++ b/src/sched/sched.cpp @@ -18,7 +18,6 @@ #include "common/log/log.hpp" #include "common/request/request.hpp" #include "common/utils/sync_object.hpp" -#include "common/utils/sycl_utils.hpp" #include "parallelizer/parallelizer.hpp" #include "sched/cache/cache.hpp" #include "sched/cache/key.hpp" @@ -29,8 +28,7 @@ #include "sched/sched_restart_manager.hpp" #ifdef CCL_ENABLE_SYCL -#include -#include +#include "common/utils/sycl_utils.hpp" #ifdef CCL_ENABLE_ZE #include "sched/entry/ze/ze_cache.hpp" @@ -380,7 +378,7 @@ void ccl_sched::complete() { // save sched type because we cannot assume that the sched with type == master // is not destroyed after we complete the request - auto* parent_sched = this->parent_sched; + auto* parent_schedule = this->parent_sched; // it's important to do finalization/cleanup before full completion of the request // because right after its completion, the request and the sched can be destroyed @@ -391,7 +389,7 @@ void ccl_sched::complete() { // completing it one more time setting the counter to 0. 
if (get_request()->complete_counter() == 1) { if (ccl::global_data::env().sched_profile) { - timer.stop(); + timer.update(); if (entries.size() > 0) { std::stringstream ss; ss << "\ncoll:"; @@ -404,10 +402,12 @@ void ccl_sched::complete() { ss << " count:" << profile_param->get_send_count(); } - ss << " time(usec):\ntotal: " << timer.str() << "\n"; + ss << " time(usec): sched total:\n" << to_string(timer) << "\n"; for (size_t idx = 0; idx < entries.size(); ++idx) { - ss << "[" << idx << "] " << entries[idx]->name() << ": " - << entries[idx]->timer.str() << "\n"; + ss << "[" << idx << "] " << entries[idx]->name() + << ": total: " << to_string(entries[idx]->total_timer); + ss << ", update: " << to_string(entries[idx]->update_timer); + ss << "\n"; } ss << "-----------------------------"; logger.info(ss.str()); @@ -420,21 +420,21 @@ void ccl_sched::complete() { // so we can finaly complete it get_request()->complete(); - if (parent_sched) { + if (parent_schedule) { // after we call try_to_restart() on the parent, it's request might be changed // so rememeber it here to call complete on - auto parent_req = parent_sched->get_request(); + auto parent_req = parent_schedule->get_request(); // check for completed parent request, see comment above for how this works if (parent_req->complete_counter() == 1) { // itt tracks only top-level sched execution if (top_level_sched) - complete_itt(parent_sched->coll_param.stream); + complete_itt(parent_schedule->coll_param.stream); // if we don't use cache, it doesn't make sense to restart the sched // as there are never be any requests to restart - if (parent_sched->coll_attr.to_cache) { + if (parent_schedule->coll_attr.to_cache) { // current sched execution is completed, always check if we need to // restart it again - parent_sched->try_to_restart(); + parent_schedule->try_to_restart(); } parent_req->complete(); } @@ -489,7 +489,7 @@ size_t ccl_sched::entries_count() const { } #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) 
-void ccl_sched::set_output_event(ccl_request* req) { +void ccl_sched::set_output_event(ccl_request* request) { if (!use_output_event) { return; } @@ -510,9 +510,20 @@ void ccl_sched::set_output_event(ccl_request* req) { LOG_DEBUG("convert L0 event: ", ev, "into a SYCL event and submit a barrier"); auto sync_event = ccl::utils::make_event(context, ev); - req->set_sync_event(sync_event); - req->set_native_event(ccl::utils::submit_barrier(q, sync_event)); - CCL_THROW_IF_NOT(!(req->get_native_event().is_host()), "something is wrong"); + if (ccl::global_data::env().enable_external_queue) { + // Todo: when using external in-order queue, need to submit barrier after CCL kernel submission + LOG_DEBUG( + "Current external passed in-order queue means CCL kernel wait execution, so no need to submit barrier"); + request->set_sync_event(sync_event); + request->set_native_event(sync_event); + } + else { + request->set_sync_event(sync_event); + if (this->coll_attr.synchronous) + request->set_native_event(ccl::utils::submit_barrier(q)); + else + request->set_native_event(ccl::utils::submit_barrier(q, sync_event)); + } #else // CCL_ENABLE_SYCL_INTEROP_EVENT CCL_THROW("interop event functionality is not available with current configuration, " @@ -550,7 +561,7 @@ void ccl_sched::release_sync_event(ccl_request* request) { #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) if (use_output_event) { // check if the event has been reset already(is_host is true for an empty one) - if (request->get_sync_event().is_host()) { + if (!request->has_sync_event()) { LOG_DEBUG("request's event has been released already, skipping"); } else { diff --git a/src/sched/sched.hpp b/src/sched/sched.hpp index d437b1c5f..3f6c6cee7 100644 --- a/src/sched/sched.hpp +++ b/src/sched/sched.hpp @@ -111,6 +111,14 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base { return op_id; } + void set_scaleout_flag() { + is_scaleout_subsched = true; + } + + int get_scaleout_flag() { + return 
is_scaleout_subsched; + } + void set_in_bin_status(ccl_sched_in_bin_status status) { in_bin_status = status; } @@ -183,6 +191,7 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base { set once and then used for all entries */ ccl_op_id_t op_id = 0; + bool is_scaleout_subsched = false; /* to track status of schedule wrt execution bin, not atomic as updated by single thread in time */ ccl_sched_in_bin_status in_bin_status = ccl_sched_in_bin_none; @@ -237,7 +246,7 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base { void release_sync_event(ccl_request* req); private: - void set_output_event(ccl_request* req); + void set_output_event(ccl_request* request); void update_active_request(bool use_delayed); static void complete_itt(const ccl_stream* stream); diff --git a/src/sched/sched_base.cpp b/src/sched/sched_base.cpp index 2bf17d875..a8cb49da8 100644 --- a/src/sched/sched_base.cpp +++ b/src/sched/sched_base.cpp @@ -20,10 +20,13 @@ #include "coll/selection/selection.hpp" #include "common/global/global.hpp" #include "comm/comm.hpp" -#include "common/utils/sycl_utils.hpp" #include "sched/entry/factory/entry_factory.hpp" #include "sched/sched_base.hpp" +#ifdef CCL_ENABLE_SYCL +#include "common/utils/sycl_utils.hpp" +#endif // CCL_ENABLE_SYCL + ccl_sched_base::ccl_sched_base(const ccl_sched_create_param& param) : sched_type(param.type), sched_id(param.id), @@ -107,6 +110,7 @@ void ccl_sched_base::update_coll_param_and_attr(const ccl_coll_param& param, if (ccl::global_data::env().priority_mode == ccl_priority_direct) { coll_attr.priority = attr.priority; } + coll_param.stream = param.stream; } size_t ccl_sched_base::get_priority() const { diff --git a/src/sched/sched_timer.cpp b/src/sched/sched_timer.cpp index a476a359e..8043bc45f 100644 --- a/src/sched/sched_timer.cpp +++ b/src/sched/sched_timer.cpp @@ -27,33 +27,37 @@ namespace ccl { -void sched_timer::start() noexcept { +void sched_timer::start() { start_time = 
std::chrono::high_resolution_clock::now(); + started = true; } -void sched_timer::stop() { - time_usec = get_elapsed_usec(); +void sched_timer::update() { + CCL_THROW_IF_NOT(started, "timer is not started, but update is requested"); + auto current_time = std::chrono::high_resolution_clock::now(); + std::chrono::duration time_span = current_time - start_time; + time_usec += time_span.count(); + start_time = current_time; } -std::string sched_timer::str() const { - std::stringstream ss; - ss.precision(2); - ss << std::fixed << time_usec; - return ss.str(); +void sched_timer::reset() { + time_usec = 0; + started = false; } -void sched_timer::print(std::string title) const { - logger.info(title, ": ", this->str()); +bool sched_timer::is_started() const { + return started; } -void sched_timer::reset() noexcept { - time_usec = 0; +long double sched_timer::get_elapsed_usec() const { + return time_usec; } -long double sched_timer::get_elapsed_usec() const noexcept { - auto current_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration time_span = current_time - start_time; - return time_span.count(); +std::string to_string(const sched_timer& timer) { + std::stringstream ss; + ss.precision(2); + ss << std::fixed << timer.get_elapsed_usec(); + return ss.str(); } #if defined(CCL_ENABLE_ITT) diff --git a/src/sched/sched_timer.hpp b/src/sched/sched_timer.hpp index 468b6cb08..aae3c37ac 100644 --- a/src/sched/sched_timer.hpp +++ b/src/sched/sched_timer.hpp @@ -19,7 +19,7 @@ #include #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) -#include "common/ze/ze_api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" #endif namespace ccl { @@ -27,19 +27,21 @@ namespace ccl { class sched_timer { public: sched_timer() = default; - void start() noexcept; - void stop(); - std::string str() const; - void print(std::string title = {}) const; - void reset() noexcept; + void start(); + void update(); + void reset(); + bool is_started() const; - long double 
get_elapsed_usec() const noexcept; + long double get_elapsed_usec() const; private: - long double time_usec; + bool started{}; + long double time_usec{}; std::chrono::high_resolution_clock::time_point start_time{}; }; +std::string to_string(const sched_timer& timer); + #ifdef CCL_ENABLE_ITT namespace profile { diff --git a/src/sched/ze/ze_event_manager.cpp b/src/sched/ze/ze_event_manager.cpp index 36b5a1a45..609511e03 100644 --- a/src/sched/ze/ze_event_manager.cpp +++ b/src/sched/ze/ze_event_manager.cpp @@ -216,11 +216,11 @@ ze_event_handle_t dynamic_event_pool::get_event() { void dynamic_event_pool::put_event(ze_event_handle_t event) { std::lock_guard lg(lock); - auto it = event_alloc_info.find(event); - CCL_THROW_IF_NOT(it != event_alloc_info.end(), "event is not from the pool"); + auto alloc_info_it = event_alloc_info.find(event); + CCL_THROW_IF_NOT(alloc_info_it != event_alloc_info.end(), "event is not from the pool"); - event_info slot = it->second; - event_alloc_info.erase(it); + event_info slot = alloc_info_it->second; + event_alloc_info.erase(alloc_info_it); // make sure we always release the completed event CCL_ASSERT(zeEventQueryStatus(event) == ZE_RESULT_SUCCESS); diff --git a/src/sched/ze/ze_event_manager.hpp b/src/sched/ze/ze_event_manager.hpp index 7ffc5962c..68d7352d6 100644 --- a/src/sched/ze/ze_event_manager.hpp +++ b/src/sched/ze/ze_event_manager.hpp @@ -17,7 +17,7 @@ #include #include -#include "common/ze/ze_api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" #include "sched/entry/ze/ze_primitives.hpp" class ccl_stream; diff --git a/src/sched/ze/ze_handle_manager.cpp b/src/sched/ze/ze_handle_manager.cpp index 72988e41c..76808a90c 100644 --- a/src/sched/ze/ze_handle_manager.cpp +++ b/src/sched/ze/ze_handle_manager.cpp @@ -36,17 +36,48 @@ std::string to_string(ipc_mem_type mem_type) { } } +// ipc_handle_desc ipc_handle_desc::ipc_handle_desc() { - memset(&handle, 0, sizeof(handle)); + memset(&ipc_handle, 0, sizeof(ipc_handle)); } 
-ipc_handle_desc::ipc_handle_desc(const ze_ipc_mem_handle_t& handle, +ipc_handle_desc::ipc_handle_desc(const ze_ipc_mem_handle_t& ipc_handle, size_t mem_offset, - ipc_mem_type mem_type) - : handle(handle), + ipc_mem_type mem_type, + int mem_handle) + : ipc_handle(ipc_handle), mem_offset(mem_offset), - mem_type(mem_type) {} + mem_type(mem_type), + mem_handle(mem_handle) {} + +ze_ipc_mem_handle_t ipc_handle_desc::mem_to_ipc_handle() const { + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::sockets) { + // for the sockets mode, we don't need to do mem_handle_to_fd + // we return immediately the ipc handle which was already inited + return ipc_handle; + } + + int fd = ccl::utils::invalid_fd; + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) { + CCL_THROW_IF_NOT(device_fd != ccl::utils::invalid_fd, "device_fd is invalid value"); + fd = ccl::ze::fd_manager::mem_handle_to_fd(device_fd, mem_handle); + LOG_DEBUG("device_fd: ", device_fd, " gotten fd from mem_handle_to_fd: ", fd); + } + else if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) { + CCL_THROW_IF_NOT(pidfd_fd != ccl::utils::invalid_fd, "pidfd_fd is invalid value"); + fd = ccl::ze::fd_manager::mem_handle_to_fd(pidfd_fd, mem_handle); + LOG_DEBUG("pidfd_fd: ", pidfd_fd, ", gotten fd from mem_handle_to_fd: ", fd); + } + else { + CCL_THROW("unexpected ipc_exchange_mode"); + } + + CCL_THROW_IF_NOT(fd != ccl::utils::invalid_fd, "mem_handle_to_fd: invalid fd: ", fd); + LOG_DEBUG("mem_handle: ", mem_handle, ", fd: ", fd); + return ccl::ze::get_handle_from_fd(fd); +} +// ipc_handle_manager ipc_handle_manager::~ipc_handle_manager() { clear(); } @@ -72,24 +103,24 @@ void ipc_handle_manager::clear() { for (int rank = 0; rank < static_cast(handles.size()); rank++) { for (size_t buf_idx = 0; buf_idx < handles[rank].size(); buf_idx++) { const auto& handle_info = handles[rank][buf_idx]; - ze_ipc_mem_handle_t handle = handle_info.handle; + 
ze_ipc_mem_handle_t ipc_handle = handle_info.ipc_handle; auto mem_ptr = handle_info.mem_ptr; auto mem_type = handle_info.mem_type; size_t mem_offset = handle_info.mem_offset; - LOG_DEBUG("close handle: { base_ptr: ", + LOG_DEBUG("close ipc_handle: { base_ptr: ", mem_ptr, ", offset: ", mem_offset, ", fd: ", - get_fd_from_handle(handle), + get_fd_from_handle(ipc_handle), ", rank: ", rank, ", buf_idx: ", buf_idx, " }"); - // when closing the handle we need to take care of pointers that points to the + // when closing the ipc_handle we need to take care of pointers that points to the // same level zero allocation. They're simply offsetted from some base pointer // although represented by different FDs. If we close this base pointer, // all the derived pointers are closed(unmapped) as well. To handle this case @@ -99,7 +130,7 @@ void ipc_handle_manager::clear() { if (mem_ptr) { ze_result_t res{}; if (handle_info.is_cached) { - // skip close, assume that handle will be closed in the cache + // skip close, assume that ipc_handle will be closed in the cache res = ZE_RESULT_SUCCESS; } else if (mem_type == ipc_mem_type::memory) { @@ -121,7 +152,7 @@ void ipc_handle_manager::clear() { } if (res != ZE_RESULT_SUCCESS) { - LOG_TRACE("unable to close memory handle: ", + LOG_TRACE("unable to close memory ipc_handle: ", "level-zero res: ", to_string(res), ", rank: ", @@ -132,11 +163,6 @@ void ipc_handle_manager::clear() { mem_ptr); } } - - if (!handle_info.is_cached) { - // ensure that fd is closed - close_handle_fd(handle); - } } } @@ -158,7 +184,7 @@ void ipc_handle_manager::set(const mem_handle_map_t& handles_arg) { LOG_DEBUG("handles are set successfully, size of handles: ", handles.size()); } -void* ipc_handle_manager::get_ptr(int rank, size_t buf_idx, ccl_comm* map_comm) { +void* ipc_handle_manager::get_ptr(int rank, size_t buf_idx, const ccl_comm* map_comm) { check_rank(rank, (map_comm) ? 
map_comm : comm); if (map_comm && (map_comm->id() != comm->id())) { int old_rank = rank; @@ -184,9 +210,9 @@ void* ipc_handle_manager::get_ptr(int rank, size_t buf_idx, ccl_comm* map_comm) } CCL_THROW_IF_NOT(buf_idx < handles[rank].size(), "buf_idx is not valid value: ", buf_idx); - // Must be a non-const ref so it can be updated when handle is opened + // must be a non-const ref so it can be updated when ipc_handle is opened ipc_handle_desc& handle_info = handles[rank][buf_idx]; - auto& handle = handle_info.handle; + auto& ipc_handle = handle_info.ipc_handle; auto& mem_ptr = handle_info.mem_ptr; auto mem_type = handle_info.mem_type; @@ -198,7 +224,7 @@ void* ipc_handle_manager::get_ptr(int rank, size_t buf_idx, ccl_comm* map_comm) } else if (mem_type == ipc_mem_type::pool) { ze_ipc_event_pool_handle_t pool_handle; - cast_mem_to_pool_handle(&pool_handle, &handle); + cast_mem_to_pool_handle(&pool_handle, &ipc_handle); open_handle(pool_handle, (ze_event_pool_handle_t*)&mem_ptr); } else { @@ -209,59 +235,69 @@ void* ipc_handle_manager::get_ptr(int rank, size_t buf_idx, ccl_comm* map_comm) LOG_DEBUG("get handle: { mem_ptr: ", mem_ptr, ", fd: ", - get_fd_from_handle(handle), + get_fd_from_handle(ipc_handle), ", rank: ", rank, ", buf_idx: ", buf_idx, " }"); - // add offset that we received along with the handle + // add offset that we received along with the ipc_handle if (mem_type == ipc_mem_type::pool) { CCL_THROW_IF_NOT(handle_info.mem_offset == 0, "offsets should be 0 for event pool"); } return static_cast(static_cast(mem_ptr) + handle_info.mem_offset); } -void ipc_handle_manager::get(int rank, size_t buf_idx, ccl_buffer& buf, ccl_comm* map_comm) { +void ipc_handle_manager::get(int rank, size_t buf_idx, ccl_buffer& buf, const ccl_comm* map_comm) { buf.set(get_ptr(rank, buf_idx, map_comm)); } void ipc_handle_manager::get(int rank, size_t buf_idx, ze_event_pool_handle_t& buf, - ccl_comm* map_comm) { + const ccl_comm* map_comm) { buf = 
(ze_event_pool_handle_t)get_ptr(rank, buf_idx, map_comm); } -void ipc_handle_manager::get_handle(const void* ptr, ze_ipc_mem_handle_t* handle) { +void ipc_handle_manager::get_handle(void* ptr, ze_ipc_mem_handle_t* ipc_handle) { CCL_THROW_IF_NOT(ptr, "no mem pointer"); - ZE_CALL(zeMemGetIpcHandle, (context, ptr, handle)); + if (global_data::env().enable_ze_cache && global_data::env().enable_ze_cache_get_ipc_handles) { + ze_memory_allocation_properties_t alloc_props = ccl::ze::default_alloc_props; + ZE_CALL(zeMemGetAllocProperties, (context, ptr, &alloc_props, &device)); + + ipc_get_handle_desc ipc_desc = { ptr, alloc_props.id }; + global_data::get().ze_data->cache->get(context, device, ipc_desc, ipc_handle); + } + else { + ZE_CALL(zeMemGetIpcHandle, (context, ptr, ipc_handle)); + } } void ipc_handle_manager::get_handle(ze_event_pool_handle_t pool, - ze_ipc_event_pool_handle_t* handle) { + ze_ipc_event_pool_handle_t* ipc_handle) { CCL_THROW_IF_NOT(pool, "no pool"); - ZE_CALL(zeEventPoolGetIpcHandle, (pool, handle)); + ZE_CALL(zeEventPoolGetIpcHandle, (pool, ipc_handle)); } void ipc_handle_manager::open_handle(ipc_handle_desc& info, void** ptr) { - if (global_data::env().enable_ze_cache && global_data::env().enable_ze_cache_ipc_handles) { + if (global_data::env().enable_ze_cache && global_data::env().enable_ze_cache_open_ipc_handles) { mem_handle_cache::value_t value{}; global_data::get().ze_data->cache->get(context, device, info, &value); - CCL_THROW_IF_NOT(value != nullptr, "unable to open handle"); + CCL_THROW_IF_NOT(value != nullptr, "unable to open ipc_handle"); *ptr = const_cast(value->get_ptr()); cached_handles.push_back(value); info.is_cached = true; } else { - ZE_CALL(zeMemOpenIpcHandle, (context, device, info.handle, 0 /* cache allocation */, ptr)); + ZE_CALL(zeMemOpenIpcHandle, + (context, device, info.mem_to_ipc_handle(), 0 /* cache allocation */, ptr)); } } -void ipc_handle_manager::open_handle(const ze_ipc_event_pool_handle_t& handle, +void 
ipc_handle_manager::open_handle(const ze_ipc_event_pool_handle_t& ipc_handle, ze_event_pool_handle_t* pool) { - ZE_CALL(zeEventPoolOpenIpcHandle, (context, handle, pool)); + ZE_CALL(zeEventPoolOpenIpcHandle, (context, ipc_handle, pool)); } void ipc_handle_manager::get_address_range(const void* ptr, void** base_ptr, size_t* size) { @@ -276,7 +312,7 @@ void ipc_handle_manager::get_address_range(const void* ptr, void** base_ptr, siz *size); } -void ipc_handle_manager::check_rank(int rank, ccl_comm* check_comm) { +void ipc_handle_manager::check_rank(int rank, const ccl_comm* check_comm) { CCL_THROW_IF_NOT( (rank >= 0) && (rank < static_cast(handles.size())) && (rank < check_comm->size()), "invalid rank: ", @@ -286,7 +322,7 @@ void ipc_handle_manager::check_rank(int rank, ccl_comm* check_comm) { ", comm.size: ", check_comm->size()); CCL_THROW_IF_NOT( - rank != check_comm->rank(), "do not expect to open handle for own rank: ", rank); + rank != check_comm->rank(), "do not expect to open ipc_handle for own rank: ", rank); } } // namespace ze diff --git a/src/sched/ze/ze_handle_manager.hpp b/src/sched/ze/ze_handle_manager.hpp index 42a9755aa..4534cbedd 100644 --- a/src/sched/ze/ze_handle_manager.hpp +++ b/src/sched/ze/ze_handle_manager.hpp @@ -18,11 +18,12 @@ #include "common/log/log.hpp" #include "common/stream/stream.hpp" #include "common/utils/buffer.hpp" +#include "common/utils/utils.hpp" #include "sched/entry/ze/ze_cache.hpp" #include "sched/entry/ze/ze_primitives.hpp" #include -#include "common/ze/ze_api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" class ccl_comm; @@ -34,26 +35,40 @@ enum class ipc_mem_type : int { unknown = 0, memory, pool }; std::string to_string(ipc_mem_type type); +struct ipc_get_handle_desc { + void* ptr{ nullptr }; + uint64_t mem_id{}; +}; + struct ipc_handle_desc { - ze_ipc_mem_handle_t handle{}; + ze_ipc_mem_handle_t ipc_handle{}; size_t mem_offset{}; void* mem_ptr{}; ipc_mem_type mem_type{}; - pid_t remote_pid{}; - ssize_t 
remote_context_id{ -1 }; + int mem_handle{ ccl::utils::invalid_mem_handle }; + pid_t remote_pid{ ccl::utils::invalid_pid }; + ssize_t remote_context_id{ ccl::utils::invalid_context_id }; uint64_t remote_mem_alloc_id{}; - ssize_t remote_device_id{ -1 }; + ssize_t remote_device_id{ ccl::utils::invalid_device_id }; + int pidfd_fd{ ccl::utils::invalid_fd }; + int device_fd{ ccl::utils::invalid_fd }; bool is_cached = false; ipc_handle_desc(); - ipc_handle_desc(const ze_ipc_mem_handle_t& handle, size_t offset, ipc_mem_type type); + ipc_handle_desc(const ze_ipc_mem_handle_t& ipc_handle, + size_t offset, + ipc_mem_type type, + int mem_handle = ccl::utils::invalid_mem_handle); ipc_handle_desc(const ipc_handle_desc&) = default; ipc_handle_desc& operator=(const ipc_handle_desc&) = default; + + ze_ipc_mem_handle_t mem_to_ipc_handle() const; }; class ipc_handle_manager { public: + // matrix with ipc handles, row - rank, column - buf_idx using mem_handle_map_t = typename std::vector>; ipc_handle_manager() = default; @@ -66,14 +81,14 @@ class ipc_handle_manager { void set(const mem_handle_map_t& handles_arg); - void* get_ptr(int rank, size_t buf_idx, ccl_comm* map_comm); - void get(int rank, size_t buf_idx, ccl_buffer& buf, ccl_comm* map_comm = nullptr); - void get(int rank, size_t buf_idx, ze_event_pool_handle_t& buf, ccl_comm* map_comm); + void* get_ptr(int rank, size_t buf_idx, const ccl_comm* map_comm); + void get(int rank, size_t buf_idx, ccl_buffer& buf, const ccl_comm* map_comm = nullptr); + void get(int rank, size_t buf_idx, ze_event_pool_handle_t& buf, const ccl_comm* map_comm); - void get_handle(const void* buffer, ze_ipc_mem_handle_t* handle); - void get_handle(ze_event_pool_handle_t pool, ze_ipc_event_pool_handle_t* handle); + void get_handle(void* ptr, ze_ipc_mem_handle_t* ipc_handle); + void get_handle(ze_event_pool_handle_t pool, ze_ipc_event_pool_handle_t* ipc_handle); void open_handle(ipc_handle_desc& info, void** ptr); - void open_handle(const 
ze_ipc_event_pool_handle_t& handle, ze_event_pool_handle_t* pool); + void open_handle(const ze_ipc_event_pool_handle_t& ipc_handle, ze_event_pool_handle_t* pool); void get_address_range(const void* ptr, void** base_ptr, size_t* size); @@ -86,7 +101,7 @@ class ipc_handle_manager { /** * The value can be destroyed in the cache if the cache reaches its limit. - * This can happen at a time when the handle is really needed. + * This can happen at a time when the ipc_handle is really needed. * We can run a lot of ranks and get fail here. * Instead, the value will be popped from the cache, but only destroyed when not needed. * We rely on the smart pointer to work. @@ -94,7 +109,7 @@ class ipc_handle_manager { */ std::list cached_handles; - void check_rank(int rank, ccl_comm* check_comm); + void check_rank(int rank, const ccl_comm* check_comm); }; } // namespace ze diff --git a/src/sched/ze/ze_ipc_event_pool_manager.hpp b/src/sched/ze/ze_ipc_event_pool_manager.hpp index 78f832749..06479e532 100644 --- a/src/sched/ze/ze_ipc_event_pool_manager.hpp +++ b/src/sched/ze/ze_ipc_event_pool_manager.hpp @@ -18,7 +18,7 @@ #include "common/stream/stream.hpp" #include "sched/entry/ze/ze_primitives.hpp" -#include "common/ze/ze_api_wrapper.hpp" +#include "common/api_wrapper/ze_api_wrapper.hpp" namespace ccl { diff --git a/src/sched/ze/ze_list_manager.cpp b/src/sched/ze/ze_list_manager.cpp index 61693ecf4..2930a6b1b 100644 --- a/src/sched/ze/ze_list_manager.cpp +++ b/src/sched/ze/ze_list_manager.cpp @@ -66,27 +66,16 @@ bool queue_info::is_copy() const { queue_factory::queue_factory(ze_device_handle_t device, ze_context_handle_t context, - queue_group_type type) + queue_group_type type, + ze_command_queue_handle_t cmd_queue) : device(device), context(context), is_copy_queue(type == queue_group_type::main || type == queue_group_type::link), - type(type) { + type(type), + cmd_queue(cmd_queue) { ze_queue_properties_t queue_props; get_queues_properties(device, &queue_props); - if 
(!global_data::env().disable_ze_family_check) { - if (queue_props.size() == 1 && queue_props.front().numQueues == 1 && - (get_device_family(device) == ccl::device_family::unknown)) { - ze_device_properties_t device_props = ccl::ze::default_device_props; - ZE_CALL(zeDeviceGetProperties, (device, &device_props)); - bool is_integrated = device_props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; - - CCL_THROW_IF_NOT(is_integrated, - "unexpected device properties flags: ", - flags_to_string(device_props.flags)); - } - } - queue_ordinal = get_queue_group_ordinal(queue_props, type); LOG_DEBUG(get_type_str(), " queue factory: use ", @@ -127,22 +116,32 @@ queue_info_t queue_factory::get(uint32_t index) { uint32_t queue_index = get_queue_index(index); CCL_THROW_IF_NOT(queue_index < queues.size(), "wrong queue index"); auto& queue = queues.at(queue_index); + if (!queue || !queue->is_valid()) { queue = std::make_shared(); queue->desc = default_cmd_queue_desc; - queue->desc.ordinal = queue_ordinal; - queue->desc.index = queue_index; queue->is_copy_queue = is_copy_queue; queue->type = type; - global_data::get().ze_data->cache->get( - worker_idx, context, device, queue->desc, &queue->queue); - LOG_DEBUG("created new ", - get_type_str(), - " queue: { ordinal: ", - queue_ordinal, - ", index: ", - queue_index, - " }"); + if (!is_copy_queue && ccl::global_data::env().enable_external_queue && cmd_queue) { + // todo: no API to support getting queue desc by command queue + // queue->desc.ordinal = external_command_queue->desc.ordinal; + // queue->desc.index = external_command_queue->desc.index; + queue->queue = cmd_queue; + LOG_DEBUG("use external command queue for ", get_type_str(), " kernels"); + } + else { + queue->desc.ordinal = queue_ordinal; + queue->desc.index = queue_index; + global_data::get().ze_data->cache->get( + worker_idx, context, device, queue->desc, &queue->queue); + LOG_DEBUG("created new ", + get_type_str(), + " queue: { ordinal: ", + queue_ordinal, + ", index: ", + 
queue_index, + " }"); + } } return queue; } @@ -151,8 +150,11 @@ void queue_factory::destroy(queue_info_t& queue) { if (!queue || !queue->is_valid()) { return; } - - global_data::get().ze_data->cache->push(worker_idx, context, device, queue->desc, queue->queue); + // No need to cache external cmd queue + if (queue->queue != cmd_queue) { + global_data::get().ze_data->cache->push( + worker_idx, context, device, queue->desc, queue->queue); + } queue.reset(); } @@ -200,19 +202,20 @@ uint32_t queue_factory::get_ordinal() const { return queue_ordinal; } -bool queue_factory::can_use_queue_group(ze_device_handle_t device, queue_group_type type) { - auto user_engine = global_data::env().ze_copy_engine; +bool queue_factory::can_use_queue_group(ze_device_handle_t device, + queue_group_type type, + copy_engine_mode mode) { switch (type) { case queue_group_type::compute: break; case queue_group_type::main: - if (user_engine != ccl_ze_copy_engine_auto && user_engine != ccl_ze_copy_engine_main) { + if (mode != copy_engine_mode::auto_mode && mode != copy_engine_mode::main) { return false; } break; case queue_group_type::link: - if (user_engine != ccl_ze_copy_engine_auto && user_engine != ccl_ze_copy_engine_link) { + if (mode != copy_engine_mode::auto_mode && mode != copy_engine_mode::link) { return false; } break; @@ -220,7 +223,7 @@ bool queue_factory::can_use_queue_group(ze_device_handle_t device, queue_group_t default: CCL_THROW("unknown queue group type"); break; } - if (type != queue_group_type::compute && user_engine == ccl_ze_copy_engine_none) { + if (type != queue_group_type::compute && mode == ccl::ze::copy_engine_mode::none) { return false; } @@ -280,28 +283,39 @@ bool list_factory::is_copy() const { list_manager::list_manager(const ccl_sched_base* sched, const ccl_stream* stream) : sched(sched), device(stream->get_ze_device()), - context(stream->get_ze_context()) { + context(stream->get_ze_context()), + cmd_queue(stream->get_ze_command_queue()) { LOG_DEBUG("create list 
manager"); CCL_THROW_IF_NOT(device, "no device"); CCL_THROW_IF_NOT(context, "no context"); + CCL_THROW_IF_NOT(sched->coll_param.comm, "no comm"); + + h2d_copy_engine_mode h2d_copy_mode = global_data::env().ze_h2d_copy_engine; comp_queue_factory = - std::make_unique(device, context, queue_group_type::compute); + std::make_unique(device, context, queue_group_type::compute, cmd_queue); comp_list_factory = std::make_unique(device, context, false); - can_use_main_queue = queue_factory::can_use_queue_group(device, queue_group_type::main); - if (can_use_main_queue) { + auto copy_engine_mode = sched->coll_param.comm->get_env()->get_ze_copy_engine(); + + main_queue_available = + queue_factory::can_use_queue_group(device, queue_group_type::main, copy_engine_mode); + + main_queue_available = main_queue_available || (h2d_copy_mode == h2d_copy_engine_mode::main); + + if (main_queue_available) { main_queue_factory = - std::make_unique(device, context, queue_group_type::main); + std::make_unique(device, context, queue_group_type::main, nullptr); } - can_use_link_queue = queue_factory::can_use_queue_group(device, queue_group_type::link); - if (can_use_link_queue) { + link_queue_available = + queue_factory::can_use_queue_group(device, queue_group_type::link, copy_engine_mode); + if (link_queue_available) { link_queue_factory = - std::make_unique(device, context, queue_group_type::link); + std::make_unique(device, context, queue_group_type::link, nullptr); } - use_copy_queue = can_use_main_queue || can_use_link_queue; + use_copy_queue = main_queue_available || link_queue_available; if (use_copy_queue) { copy_list_factory = std::make_unique(device, context, true); } @@ -313,25 +327,46 @@ list_manager::~list_manager() { std::pair list_manager::get_factory_and_map( bool is_copy, - bool peer_card_copy) const { + copy_direction direction) const { + CCL_THROW_IF_NOT((!is_copy && direction == copy_direction::undefined) || + (is_copy && direction != copy_direction::undefined), + "wrong 
direction"); + queue_factory* factory = nullptr; queue_map_t* queue_map = nullptr; - if (is_copy) { - if (can_use_link_queue && peer_card_copy) { + + if (direction == copy_direction::c2c) { + if (link_queue_available) { factory = link_queue_factory.get(); queue_map = const_cast(&link_queue_map); } - else { + else if (main_queue_available) { factory = main_queue_factory.get(); queue_map = const_cast(&main_queue_map); } } - else { + // h2d, d2h, d2d, t2t + else if (direction != copy_direction::undefined) { + const bool use_compute_fallback = + ccl::global_data::env().ze_enable_ccs_fallback_for_copy && !main_queue_available; + + if (main_queue_available) { + factory = main_queue_factory.get(); + queue_map = const_cast(&main_queue_map); + } + else if (link_queue_available && !use_compute_fallback) { + factory = link_queue_factory.get(); + queue_map = const_cast(&link_queue_map); + } + } + + // fallback + if (!factory || !queue_map) { factory = comp_queue_factory.get(); queue_map = const_cast(&comp_queue_map); } - CCL_THROW_IF_NOT(factory, "no factory"); - CCL_THROW_IF_NOT(queue_map, "no map"); + + CCL_THROW_IF_NOT(factory && queue_map, "unable select list queue"); return std::make_pair(factory, queue_map); } @@ -339,9 +374,9 @@ list_info_t list_manager::get_list(const sched_entry* entry, uint32_t index, bool is_copy, const std::vector& wait_events, - bool peer_card_copy) { + copy_direction direction) { // get comp or copy primitives - auto factory_map_pair = get_factory_and_map(is_copy, peer_card_copy); + auto factory_map_pair = get_factory_and_map(is_copy, direction); queue_factory* factory = factory_map_pair.first; queue_map_t* queue_map = factory_map_pair.second; auto queue = factory->get(index); @@ -372,8 +407,10 @@ list_info_t list_manager::get_list(const sched_entry* entry, // if we dont have any lists for current queue if (new_list_for_queue && new_entry_for_list) { + auto& list_factory = (is_copy) ? 
copy_list_factory : comp_list_factory; + CCL_THROW_IF_NOT(list_factory, "no factory"); // creaete new list - list = (is_copy) ? copy_list_factory->get(queue) : comp_list_factory->get(queue); + list = list_factory->get(queue); access_list.push_back({ queue, list }); // remember list for current entry entry_map[entry].push_back(std::make_pair(queue, list)); @@ -410,23 +447,20 @@ ze_command_list_handle_t list_manager::get_comp_list( const sched_entry* entry, const std::vector& wait_events, uint32_t index) { - auto list = get_list(entry, index, false, wait_events, false); + auto list = get_list(entry, index, false, wait_events, copy_direction::undefined); return list->get_native(); } ze_command_list_handle_t list_manager::get_copy_list( const sched_entry* entry, const std::vector& wait_events, - uint32_t index, - bool peer_card_copy) { - // use main for intra copy or link for inter copy - if ((!peer_card_copy && can_use_main_queue) || (peer_card_copy && can_use_link_queue)) { - auto list = get_list(entry, index, true, wait_events, peer_card_copy); + copy_direction direction, + uint32_t index) { + if (link_queue_available || main_queue_available) { + auto list = get_list(entry, index, true, wait_events, direction); return list->get_native(); } - else { - return get_comp_list(entry, wait_events, index); - } + return get_comp_list(entry, wait_events, index); } void list_manager::clear() { @@ -472,6 +506,10 @@ bool list_manager::can_use_copy_queue() const { return use_copy_queue; } +bool list_manager::can_use_main_queue() const { + return main_queue_available; +} + bool list_manager::is_executed() const { return executed; } @@ -563,7 +601,7 @@ void list_manager::print_dump() const { ss << " entries:"; for (const auto& list_entries_pair : map->at(queue_index).second) { auto entry = list_entries_pair.first; - ss << " " << entry->name(); + ss << " " << entry->name_ext(); } ss << "\n"; } @@ -571,7 +609,7 @@ void list_manager::print_dump() const { } if (sched->use_single_list 
&& !access_list.empty()) { - ss << "execution order: {\n"; + ss << "submission order: {\n"; for (auto& queue_list_pair : access_list) { auto& queue = queue_list_pair.first; auto& list = queue_list_pair.second; diff --git a/src/sched/ze/ze_list_manager.hpp b/src/sched/ze/ze_list_manager.hpp index c323e0233..df3162139 100644 --- a/src/sched/ze/ze_list_manager.hpp +++ b/src/sched/ze/ze_list_manager.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include "sched/entry/copy/copy_helper.hpp" #include "sched/entry/ze/ze_primitives.hpp" #include @@ -73,7 +74,10 @@ using queue_info_t = typename std::shared_ptr; class queue_factory { public: - queue_factory(ze_device_handle_t device, ze_context_handle_t context, queue_group_type type); + queue_factory(ze_device_handle_t device, + ze_context_handle_t context, + queue_group_type type, + ze_command_queue_handle_t cmd_queue); queue_factory& operator=(const queue_factory&) = delete; queue_factory& operator=(queue_factory&&) = delete; ~queue_factory(); @@ -83,13 +87,16 @@ class queue_factory { bool is_copy() const; uint32_t get_ordinal() const; - static bool can_use_queue_group(ze_device_handle_t device, queue_group_type type); + static bool can_use_queue_group(ze_device_handle_t device, + queue_group_type type, + copy_engine_mode mode); private: const ze_device_handle_t device; const ze_context_handle_t context; const bool is_copy_queue; const queue_group_type type; + const ze_command_queue_handle_t cmd_queue; static constexpr ssize_t worker_idx = 0; @@ -138,19 +145,21 @@ class list_manager { uint32_t index = 0); ze_command_list_handle_t get_copy_list(const sched_entry* entry = nullptr, const std::vector& wait_events = {}, - uint32_t index = 0, - bool peer_card_copy = false); + copy_direction direction = copy_direction::d2d, + uint32_t index = 0); void clear(); void reset_execution_state(); bool can_use_copy_queue() const; + bool can_use_main_queue() const; bool is_executed() const; private: const ccl_sched_base* sched; const 
ze_device_handle_t device; const ze_context_handle_t context; + const ze_command_queue_handle_t cmd_queue; std::unique_ptr comp_queue_factory; std::unique_ptr link_queue_factory; std::unique_ptr main_queue_factory; @@ -182,16 +191,16 @@ class list_manager { std::list> access_list; bool executed = false; bool use_copy_queue = false; - bool can_use_main_queue = false; - bool can_use_link_queue = false; + bool main_queue_available = false; + bool link_queue_available = false; std::pair get_factory_and_map(bool is_copy, - bool peer_card_copy) const; + copy_direction direction) const; list_info_t get_list(const sched_entry* entry, uint32_t index, bool is_copy, const std::vector& wait_events, - bool peer_card_copy); + copy_direction direction); void execute_list(queue_info_t& queue, list_info_t& list); diff --git a/src/topology/topo_manager.cpp b/src/topology/topo_manager.cpp index e5ed1302e..434e0b253 100644 --- a/src/topology/topo_manager.cpp +++ b/src/topology/topo_manager.cpp @@ -20,7 +20,6 @@ #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) #include "common/utils/sycl_utils.hpp" #include "sched/entry/ze/ze_primitives.hpp" -#include #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE namespace ccl { @@ -116,6 +115,36 @@ std::string to_string(const ze_rank_info_vec_t& ze_rank_info_vec, return ss.str(); } +bool topo_manager::has_oversubscription() const { + return is_oversubscription_detected; +} + +bool topo_manager::oversubscription_detected(const ze_rank_info_vec_t& ze_rank_infos, + const host_info_vec_t& host_infos) { + size_t unique_device_uuids_count = topo_manager::invalid_device_uuids_count; + for (const auto& host_info : host_infos) { + std::vector unique_device_uuids; + for (auto rank : host_info.ranks) { + const auto& rank_info = ze_rank_infos[rank]; + if (is_unique_uuid(unique_device_uuids, rank_info.device_uuid)) { + unique_device_uuids.push_back(rank_info.device_uuid); + } + } + unique_device_uuids_count += unique_device_uuids.size(); + } + + 
CCL_THROW_IF_NOT(unique_device_uuids_count != topo_manager::invalid_device_uuids_count, + "invalid unique_device_uuids_count"); + if (unique_device_uuids_count < rank_info_vec.size()) { + LOG_DEBUG("unique_device_uuids_count: ", + unique_device_uuids_count, + ", comm_size: ", + rank_info_vec.size()); + return true; + } + return false; +} + std::string to_string(const p2p_matrix_t& matrix) { CCL_THROW_IF_NOT(!matrix.empty()); @@ -315,13 +344,7 @@ bool topo_manager::is_sub_vector(const std::vector& vec, std::vector unique_sub_vec; for (const auto& uuid : sub_vec) { - bool is_unique_uuid = - (std::find_if( - unique_sub_vec.begin(), unique_sub_vec.end(), [&uuid](const ze_device_uuid_t& u) { - return ze::is_same_dev_uuid(uuid, u); - }) == unique_sub_vec.end()); - - if (is_unique_uuid) { + if (is_unique_uuid(unique_sub_vec, uuid)) { unique_sub_vec.push_back(uuid); } } @@ -385,32 +408,42 @@ domains_t topo_manager::parse_topo_env() { ccl::utils::str_to_array(std::string(env_to_parse), ";", domain_raw_strs); check_domain_count(domain_raw_strs.size()); - std::vector> domain_strs; - domain_strs.push_back(get_domain_string(domain_raw_strs[topo_manager::card_domain_idx], - std::string(topo_manager::card_domain_name))); - domain_strs.push_back(get_domain_string(domain_raw_strs[topo_manager::plane_domain_idx], - std::string(topo_manager::plane_domain_name))); - - for (const auto& domain_str : domain_strs) { - for (const auto& domain_pair : domain_str) { - std::vector> proc_indexes; - auto substrs = get_subdomain_strings(domain_pair.second); - for (const auto& substr : substrs) { - std::vector procs{}; - ccl::utils::str_to_array(substr, ",", procs); - for (const auto& proc : procs) { - const auto local_proc_count = - ccl::global_data::get().executor->get_local_proc_count(); - CCL_THROW_IF_NOT(proc < local_proc_count, - "unexpected process number: ", - proc, - ", it should be less than: ", - local_proc_count); - } - proc_indexes.push_back(procs); - } - domains.insert({ 
domain_pair.first, proc_indexes }); + std::vector> domain_pairs; + domain_pairs.push_back(get_domain_pair(domain_raw_strs[topo_manager::card_domain_idx], + std::string(topo_manager::card_domain_name))); + domain_pairs.push_back(get_domain_pair(domain_raw_strs[topo_manager::plane_domain_idx], + std::string(topo_manager::plane_domain_name))); + + const auto local_proc_count = ccl::global_data::get().get_local_proc_count(); + + std::vector all_local_procs(local_proc_count); + std::iota(all_local_procs.begin(), all_local_procs.end(), 0); + + for (const auto& domain_pair : domain_pairs) { + std::vector> proc_indexes; + auto domain_idx = domain_pair.first; + auto& domain_raw_str = domain_pair.second; + auto substrs = get_subdomain_strings(domain_raw_str); + for (const auto& substr : substrs) { + std::vector procs{}; + ccl::utils::str_to_array(substr, ",", procs); + proc_indexes.push_back(procs); + } + + std::vector all_domain_procs; + for (const auto& procs : proc_indexes) { + all_domain_procs.insert(all_domain_procs.end(), procs.begin(), procs.end()); } + std::sort(all_domain_procs.begin(), all_domain_procs.end()); + + CCL_THROW_IF_NOT(all_domain_procs == all_local_procs, + "unexpected process indexes for topo domain ", + domain_raw_strs[domain_idx], + ", all local processes should be covered by user-supplied topo domain", + ", local process count ", + local_proc_count); + + domains.insert({ domain_idx, proc_indexes }); } check_domain_count(domains.size()); return domains; @@ -524,41 +557,8 @@ bool topo_manager::check_colors() const { return expected_colors; } -void topo_manager::allgather(const void* send_buf, void* recv_buf, int bytes) { - std::vector recv_bytes(comm->get_size(), bytes); - allgatherv(send_buf, recv_buf, recv_bytes); -} - -void topo_manager::allgatherv(const void* send_buf, - void* recv_buf, - const std::vector& recv_bytes) { - atl_req_t req{}; - - int comm_rank = comm->get_rank(); - int comm_size = comm->get_size(); - - 
CCL_THROW_IF_NOT((int)recv_bytes.size() == comm->get_size(), - "unexpected recv_bytes size ", - recv_bytes.size(), - ", comm_size ", - comm_size); - - std::vector offsets(comm_size, 0); - for (int i = 1; i < comm_size; i++) { - offsets[i] = offsets[i - 1] + recv_bytes[i - 1]; - } - - comm->allgatherv(0 /* ep_idx */, - send_buf, - recv_bytes[comm_rank], - recv_buf, - recv_bytes.data(), - offsets.data(), - req); - comm->wait(0 /* ep_idx */, req); -} - void topo_manager::fill_env_colors(const rank_info_vec_t& info_vec) { + CCL_THROW_IF_NOT(!domains.empty()); for (const auto& domain : domains) { auto& subdomains = domain.second; int color_idx = 0; @@ -619,11 +619,15 @@ void topo_manager::fill_ze_intra_colors(const rank_info_vec_t& local_info_vec) { for (const auto& info : local_info_vec) { const auto& pci_addr = ze_rank_info_vec[info.rank].pci_addr; + + // search last card with the same pci_addr auto card_it = - std::find_if(cards.begin(), cards.end(), [&pci_addr](const card_info_t& info) { + std::find_if(cards.rbegin(), cards.rend(), [&pci_addr](const card_info_t& info) { return ze::is_same_pci_addr(pci_addr, info.first); }); - if (card_it == cards.end()) { + + // if there is no such card or card already filled create new one + if (card_it == cards.rend() || (card_it->second.size() == max_ranks_per_card)) { cards.push_back(std::make_pair(pci_addr, std::vector{ info.rank })); } else { @@ -631,20 +635,13 @@ void topo_manager::fill_ze_intra_colors(const rank_info_vec_t& local_info_vec) { } } - int color = 0; - size_t ranks_per_color = 0; - for (const auto& card : cards) { - const auto& card_ranks = card.second; + for (size_t card_idx = 0; card_idx < cards.size(); card_idx++) { + const auto& card_ranks = cards[card_idx].second; auto unique_card_ranks = std::set(card_ranks.begin(), card_ranks.end()); CCL_THROW_IF_NOT(card_ranks.size() == unique_card_ranks.size()); for (const auto& rank : card_ranks) { check_invalid_color(intra_card_colors[rank]); - intra_card_colors[rank] 
= color; - ranks_per_color++; - if ((ranks_per_color == max_ranks_per_card) || (rank == card_ranks.back())) { - color++; - ranks_per_color = 0; - } + intra_card_colors[rank] = card_idx; } } } @@ -818,8 +815,8 @@ fabric_ports_t topo_manager::get_fabric_ports() { uint32_t port_count{}; // ZE_CALL(zesDeviceEnumFabricPorts, ((zes_device_handle_t)ze_device, &port_count, NULL)); - if (zesDeviceEnumFabricPorts((zes_device_handle_t)ze_device, &port_count, NULL) == - ZE_RESULT_ERROR_UNINITIALIZED) { + if (zesDeviceEnumFabricPorts((zes_device_handle_t)ze_device, &port_count, NULL) != + ZE_RESULT_SUCCESS) { LOG_INFO("can not retrieve ze fabric ports"); return {}; } @@ -883,7 +880,7 @@ fabric_ports_t topo_manager::get_fabric_ports() { int my_port_count = (int)my_ports.size(); std::vector all_port_counts(comm_size); - allgather(&my_port_count, all_port_counts.data(), sizeof(my_port_count)); + utils::allgather(comm, &my_port_count, all_port_counts.data(), sizeof(my_port_count)); size_t total_port_count = std::accumulate(all_port_counts.begin(), all_port_counts.end(), 0); @@ -909,7 +906,7 @@ fabric_ports_t topo_manager::get_fabric_ports() { std::vector all_ports(total_port_count); - allgatherv(my_ports.data(), all_ports.data(), recv_bytes); + utils::allgatherv(comm, my_ports.data(), all_ports.data(), recv_bytes); // print all ports before filtering if (comm_rank == 0) { @@ -1059,6 +1056,14 @@ void topo_manager::check_planes(const std::vector& planes) { ", expected_size ", expected_size); } + +bool topo_manager::is_unique_uuid(std::vector& unique_vec, + const ze_device_uuid_t& uuid) { + return (std::find_if(unique_vec.begin(), unique_vec.end(), [&uuid](const ze_device_uuid_t& u) { + return ze::is_same_dev_uuid(uuid, u); + }) == unique_vec.end()); +} + #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE rank_info_vec_t topo_manager::get_filtered_rank_info_vec(int filter_host_idx) const { @@ -1089,9 +1094,8 @@ void topo_manager::check_domain_count(size_t domain_count) { 
topo_manager::max_domain_count); } -std::map topo_manager::get_domain_string(const std::string& input_str, - const std::string& key) { - std::map map; +std::pair topo_manager::get_domain_pair(const std::string& input_str, + const std::string& key) { auto str = input_str; size_t pos = str.find(key); @@ -1109,8 +1113,8 @@ std::map topo_manager::get_domain_string(const std::string& in CCL_THROW_IF_NOT( domain_idx != topo_manager::invalid_domain_idx, "unexpected domain index: ", domain_idx); - map.insert({ domain_idx, str }); - return map; + + return std::make_pair(domain_idx, str); } std::vector topo_manager::get_subdomain_strings(const std::string& input_str) { @@ -1144,7 +1148,7 @@ void topo_manager::build_host_info() { gethostname(my_hostname, max_hostname_len - 1); LOG_DEBUG("rank: ", comm_rank, ", size: ", comm_size, ", host: ", my_hostname); - allgather(my_hostname, all_hostnames_raw.data(), max_hostname_len); + utils::allgather(comm, my_hostname, all_hostnames_raw.data(), max_hostname_len); std::vector all_hostnames(comm_size); std::set unique_hostnames; @@ -1217,11 +1221,11 @@ void topo_manager::base_init(std::shared_ptr atl_comm, topo_rank_info rank_info{}; rank_info.rank = comm_rank; rank_info.host_idx = host_idx; - rank_info.local_proc_idx = ccl::global_data::get().executor->get_local_proc_idx(); + rank_info.local_proc_idx = ccl::global_data::get().get_local_proc_idx(); std::string rank_uuid = topo_manager::generate_uuid(); std::copy(rank_uuid.begin(), rank_uuid.end(), rank_info.uuid); - allgather(&rank_info, rank_info_vec.data(), sizeof(rank_info)); + utils::allgather(comm, &rank_info, rank_info_vec.data(), sizeof(rank_info)); for (size_t idx = 0; idx < rank_info_vec.size(); idx++) { uuids[idx] = std::string(rank_info_vec[idx].uuid); @@ -1232,15 +1236,22 @@ void topo_manager::base_init(std::shared_ptr atl_comm, idx); } -#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) - ze_base_init(device, context); - is_p2p_access_enabled = check_p2p_access(); 
-#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE - if (!(device && context)) { return; } +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + if (device.get()->get_native().get_backend() == utils::get_level_zero_backend()) { + ze_base_init(device, context); + } + else { + if (ccl::global_data::env().topo_color == topo_color_mode::ze) { + LOG_INFO("fallback to fixed topo color mode"); + ccl::global_data::env().topo_color = topo_color_mode::fixed; + } + } +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + if (ccl::global_data::env().topo_color == topo_color_mode::fixed) { for (int h_idx = 0; h_idx < (int)host_info_vec.size(); h_idx++) { fill_fixed_colors(get_filtered_rank_info_vec(h_idx)); @@ -1292,19 +1303,6 @@ void topo_manager::ze_base_init(std::shared_ptr device, int comm_rank = comm->get_rank(); int comm_size = comm->get_size(); - p2p_matrix.resize(comm_size); - for (size_t i = 0; i < p2p_matrix.size(); i++) { - p2p_matrix[i].resize(comm_size, false); - } - - if (!(device && context)) { - return; - } - - if (device.get()->get_native().get_backend() != utils::get_level_zero_backend()) { - return; - } - ze_device = sycl::get_native(device.get()->get_native()); CCL_THROW_IF_NOT(ze_device, "null ze device"); ZE_CALL(zeDeviceGetProperties, (ze_device, &dev_props)); @@ -1318,8 +1316,8 @@ void topo_manager::ze_base_init(std::shared_ptr device, zes_pci_properties_t pci_props = {}; // ZE_CALL(zesDevicePciGetProperties, ((zes_device_handle_t)ze_device, &pci_props)); - if (zesDevicePciGetProperties((zes_device_handle_t)ze_device, &pci_props) == - ZE_RESULT_ERROR_UNINITIALIZED) { + if (zesDevicePciGetProperties((zes_device_handle_t)ze_device, &pci_props) != + ZE_RESULT_SUCCESS) { LOG_INFO("can not retrieve ze pci properties"); } else { @@ -1330,7 +1328,7 @@ void topo_manager::ze_base_init(std::shared_ptr device, ze_rank_info.subdev_id = dev_props.subdeviceId; ze_rank_info.dev_prop_flags = dev_props.flags; - allgather(&ze_rank_info, ze_rank_info_vec.data(), sizeof(ze_rank_info)); 
+ utils::allgather(comm, &ze_rank_info, ze_rank_info_vec.data(), sizeof(ze_rank_info)); // build fabric port info fabric_ports = get_fabric_ports(); @@ -1338,6 +1336,7 @@ void topo_manager::ze_base_init(std::shared_ptr device, // build p2p connectivity info const auto& node_devices = global_data::get().ze_data->devices; p2p_matrix = build_p2p_matrix(get_filtered_devices(node_devices)); + is_p2p_access_enabled = check_p2p_access(); LOG_DEBUG("p2p matrix: \n", ccl::to_string(p2p_matrix), "\nnumber of node devices: ", @@ -1346,6 +1345,8 @@ void topo_manager::ze_base_init(std::shared_ptr device, if (comm_rank == 0) { LOG_INFO("ze_rank_info_vec: ", ccl::to_string(ze_rank_info_vec, host_info_vec)); } + + is_oversubscription_detected = oversubscription_detected(ze_rank_info_vec, host_info_vec); } #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE diff --git a/src/topology/topo_manager.hpp b/src/topology/topo_manager.hpp index 32a7862fd..54a777d0e 100644 --- a/src/topology/topo_manager.hpp +++ b/src/topology/topo_manager.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include "atl/atl_base_comm.hpp" #include "common/utils/utils.hpp" #include "oneapi/ccl/config.h" @@ -26,8 +25,9 @@ #include #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) -#include "common/global/ze_data.hpp" +#include "common/global/ze/ze_data.hpp" #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE +#include "common/utils/exchange_utils.hpp" namespace ccl { @@ -112,6 +112,7 @@ class topo_manager { static constexpr int max_ranks_per_card = 2; static constexpr int max_ranks_per_plane = 8; static constexpr int max_domain_count = 2; + static constexpr size_t invalid_device_uuids_count = 0; static constexpr int card_domain_idx = 0; static constexpr int plane_domain_idx = 1; @@ -145,7 +146,11 @@ class topo_manager { static p2p_matrix_t build_p2p_matrix(const std::vector& devices); static bool is_sub_vector(const std::vector& vec, const std::vector& sub_vec); + + bool has_oversubscription() const; + bool 
is_oversubscription_detected = false; #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + rank_info_vec_t get_filtered_rank_info_vec(int filter_host_idx) const; static std::string generate_uuid(); static domains_t parse_topo_env(); @@ -158,9 +163,6 @@ class topo_manager { private: bool check_colors() const; - void allgather(const void* send_buf, void* recv_buf, int bytes); - void allgatherv(const void* send_buf, void* recv_buf, const std::vector& recv_bytes); - void fill_env_colors(const rank_info_vec_t& local_info_vec); void fill_fixed_colors(const rank_info_vec_t& info_vec); @@ -177,15 +179,15 @@ class topo_manager { fabric_ports_t get_fabric_ports(); static void check_planes(const std::vector& planes); + static bool is_unique_uuid(std::vector& unique_vec, + const ze_device_uuid_t& uuid); #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE - rank_info_vec_t get_filtered_rank_info_vec(int filter_host_idx) const; - static void check_invalid_color(int color); static void check_domain_count(size_t domain_count); - static std::map get_domain_string(const std::string& input_str, - const std::string& key); + static std::pair get_domain_pair(const std::string& input_str, + const std::string& key); static std::vector get_subdomain_strings(const std::string& input_str); void build_host_info(); @@ -195,6 +197,9 @@ class topo_manager { std::shared_ptr context); #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) void ze_base_init(std::shared_ptr device, std::shared_ptr context); + + bool oversubscription_detected(const ze_rank_info_vec_t& ze_rank_infos, + const host_info_vec_t& host_infos); #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE void post_init(); diff --git a/src/unordered_coll/unordered_coll.cpp b/src/unordered_coll/unordered_coll.cpp index a6589ca88..dadec45bc 100644 --- a/src/unordered_coll/unordered_coll.cpp +++ b/src/unordered_coll/unordered_coll.cpp @@ -113,9 +113,9 @@ ccl_request* ccl_unordered_coll_manager::postpone(ccl_sched* sched) { lock.unlock(); 
CCL_ASSERT(sched->coll_param.comm); - auto comm = sched->coll_param.comm->clone_with_new_id(comm_id); - add_comm(match_id, comm); - run_sched(sched, comm.get()); + auto new_comm = sched->coll_param.comm->clone_with_new_id(comm_id); + add_comm(match_id, new_comm); + run_sched(sched, new_comm.get()); } } diff --git a/tests/functional/CMakeLists.txt b/tests/functional/CMakeLists.txt index e89bc3d46..db1d299fe 100644 --- a/tests/functional/CMakeLists.txt +++ b/tests/functional/CMakeLists.txt @@ -44,6 +44,7 @@ if (${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) set(COMMON_CMAKE_DIR ${PROJECT_SOURCE_DIR}/../../cmake) include(${COMMON_CMAKE_DIR}/helpers.cmake) set_lp_env() + define_compute_backend() if (COMPUTE_BACKEND) set_compute_backend(${COMMON_CMAKE_DIR}) endif() @@ -75,6 +76,7 @@ include_directories(${INC_DIRS}) message(STATUS "FT build type: ${CMAKE_BUILD_TYPE}") message(STATUS "FT CCL_ROOT: ${CCL_ROOT}") message(STATUS "FT INC_DIRS: ${INC_DIRS}") + message(STATUS "FT COMPUTE_BACKEND: ${COMPUTE_BACKEND}") if (${CMAKE_VERSION} VERSION_LESS 3.1) @@ -133,17 +135,11 @@ foreach(src ${sources}) target_link_libraries(${executable} PUBLIC ccl) target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/) target_link_libraries(${executable} PUBLIC mpi) - target_link_libraries(${executable} PUBLIC -L${LIBFABRIC_LIB_DIR}) - target_link_libraries(${executable} PUBLIC fabric) target_link_libraries(${executable} PUBLIC ${COMPUTE_BACKEND_TARGET_NAME}) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_TESTS} OPTIONAL) add_test (NAME ${executable} CONFIGURATIONS default COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/${executable} --gtest_output=xml:${CCL_INSTALL_TESTS}/${executable}_default_report.junit.xml) endforeach() -foreach(algo direct; naive; flat; multi_bcast; topo) -add_test (NAME allgatherv_${algo} CONFIGURATIONS allgatherv_${algo} COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/allgatherv_test 
--gtest_output=xml:${CCL_INSTALL_TESTS}/allgatherv_${algo}_report.junit.xml) -endforeach() - add_test (NAME allreduce_fusion CONFIGURATIONS allreduce_fusion COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/allreduce_test --gtest_output=xml:${CCL_INSTALL_TESTS}/allreduce_fusion_report.junit.xml) foreach(proc_map ${PROC_MAPS}) @@ -158,10 +154,22 @@ foreach(proc_map ${PROC_MAPS}) foreach(ppn ${PPNS}) + foreach(algo direct; naive; flat; multi_bcast; topo) + add_test (NAME allgatherv_${algo}_${N}_${ppn} CONFIGURATIONS allgatherv_${algo}_${N}_${ppn} COMMAND mpiexec.hydra -l -n ${N} -ppn ${ppn} ${CCL_INSTALL_TESTS}/allgatherv_test --gtest_output=xml:${CCL_INSTALL_TESTS}/allgatherv_${algo}_${N}_${ppn}_report.junit.xml) + endforeach() + foreach(algo direct; rabenseifner; nreduce; ring; ring_rma; double_tree; recursive_doubling; 2d; topo) add_test (NAME allreduce_${algo}_${N}_${ppn} CONFIGURATIONS allreduce_${algo}_${N}_${ppn} COMMAND mpiexec.hydra -l -n ${N} -ppn ${ppn} ${CCL_INSTALL_TESTS}/allreduce_test --gtest_output=xml:${CCL_INSTALL_TESTS}/allreduce_${algo}_${N}_${ppn}_report.junit.xml) endforeach() + foreach(algo direct; naive; scatter; topo) + add_test (NAME alltoall_${algo}_${N}_${ppn} CONFIGURATIONS alltoall_${algo}_${N}_${ppn} COMMAND mpiexec.hydra -l -n ${N} -ppn ${ppn} ${CCL_INSTALL_TESTS}/alltoall_test --gtest_output=xml:${CCL_INSTALL_TESTS}/alltoall_${algo}_${N}_${ppn}_report.junit.xml) + endforeach() + + foreach(algo direct; naive; scatter; topo) + add_test (NAME alltoallv_${algo}_${N}_${ppn} CONFIGURATIONS alltoallv_${algo}_${N}_${ppn} COMMAND mpiexec.hydra -l -n ${N} -ppn ${ppn} ${CCL_INSTALL_TESTS}/alltoallv_test --gtest_output=xml:${CCL_INSTALL_TESTS}/alltoallv_${algo}_${N}_${ppn}_report.junit.xml) + endforeach() + foreach(algo direct; ring; double_tree; naive; topo) add_test (NAME bcast_${algo}_${N}_${ppn} CONFIGURATIONS bcast_${algo}_${N}_${ppn} COMMAND mpiexec.hydra -l -n ${N} -ppn ${ppn} ${CCL_INSTALL_TESTS}/bcast_test 
--gtest_output=xml:${CCL_INSTALL_TESTS}/bcast_${algo}_${N}_${ppn}_report.junit.xml) endforeach() @@ -170,29 +178,9 @@ foreach(proc_map ${PROC_MAPS}) add_test (NAME reduce_${algo}_${N}_${ppn} CONFIGURATIONS reduce_${algo}_${N}_${ppn} COMMAND mpiexec.hydra -l -n ${N} -ppn ${ppn} ${CCL_INSTALL_TESTS}/reduce_test --gtest_output=xml:${CCL_INSTALL_TESTS}/reduce_${algo}_${N}_${ppn}_report.junit.xml) endforeach() - endforeach() -endforeach() - -foreach(algo nreduce; ring; 2d) -add_test (NAME allreduce_${algo}_chunked CONFIGURATIONS allreduce_${algo}_chunked COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/allreduce_test --gtest_output=xml:${CCL_INSTALL_TESTS}/allreduce_${algo}_chunked_report.junit.xml) -endforeach() - -foreach(algo direct; naive; scatter; topo) -add_test (NAME alltoall_${algo} CONFIGURATIONS alltoall_${algo} COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/alltoall_test --gtest_output=xml:${CCL_INSTALL_TESTS}/alltoall_${algo}_report.junit.xml) -endforeach() - -foreach(algo scatter) -add_test (NAME alltoall_${algo}_chunked CONFIGURATIONS alltoall_${algo}_chunked COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/alltoall_test --gtest_output=xml:${CCL_INSTALL_TESTS}/alltoall_${algo}_chunked_report.junit.xml) -endforeach() - -foreach(algo direct; naive; scatter; topo) -add_test (NAME alltoallv_${algo} CONFIGURATIONS alltoallv_${algo} COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/alltoallv_test --gtest_output=xml:${CCL_INSTALL_TESTS}/alltoallv_${algo}_report.junit.xml) -endforeach() - -foreach(algo scatter) -add_test (NAME alltoallv_${algo}_chunked CONFIGURATIONS alltoallv_${algo}_chunked COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/alltoallv_test --gtest_output=xml:${CCL_INSTALL_TESTS}/alltoallv_${algo}_chunked_report.junit.xml) -endforeach() + foreach(algo direct; ring) + add_test (NAME reduce_scatter_${algo}_${N}_${ppn} CONFIGURATIONS reduce_scatter_${algo}_${N}_${ppn} COMMAND mpiexec.hydra -l -n ${N} -ppn 
${ppn} ${CCL_INSTALL_TESTS}/reduce_scatter_test --gtest_output=xml:${CCL_INSTALL_TESTS}/reduce_scatter_${algo}_${N}_${ppn}_report.junit.xml) + endforeach() -foreach(algo direct; ring) -add_test (NAME reduce_scatter_${algo} CONFIGURATIONS reduce_scatter_${algo} COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/reduce_scatter_test --gtest_output=xml:${CCL_INSTALL_TESTS}/reduce_scatter_${algo}_report.junit.xml) + endforeach() endforeach() diff --git a/third-party-programs.txt b/third-party-programs.txt index 14d88e660..8d2fc7d16 100644 --- a/third-party-programs.txt +++ b/third-party-programs.txt @@ -1,5 +1,5 @@ Intel(R) oneAPI Collective Communications Library (oneCCL) -2021.7.1 Third Party Programs File +2021.8.0 Third Party Programs File This file is the "third-party-programs.txt" file specified in the associated Intel end user license agreement for the Intel software you are licensing. @@ -338,6 +338,100 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------- + +9. OpenPMIx + +Most files in this release are marked with the copyrights of the +organizations who have edited them. The copyrights below are in no +particular order and generally reflect members of the Open MPI core +team who have contributed code that may or may not have been ported +to PMIx. Per the terms of that LICENSE, we include the list here. +The copyrights for code used under license from other parties +are included in the corresponding files. + +Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + University Research and Technology + Corporation. All rights reserved. +Copyright (c) 2004-2010 The University of Tennessee and The University + of Tennessee Research Foundation. All rights + reserved. 
+Copyright (c) 2004-2010 High Performance Computing Center Stuttgart, + University of Stuttgart. All rights reserved. +Copyright (c) 2004-2008 The Regents of the University of California. + All rights reserved. +Copyright (c) 2006-2010 Los Alamos National Security, LLC. All rights + reserved. +Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. +Copyright (c) 2006-2010 Voltaire, Inc. All rights reserved. +Copyright (c) 2006-2011 Sandia National Laboratories. All rights reserved. +Copyright (c) 2006-2010 Sun Microsystems, Inc. All rights reserved. + Use is subject to license terms. +Copyright (c) 2006-2010 The University of Houston. All rights reserved. +Copyright (c) 2006-2009 Myricom, Inc. All rights reserved. +Copyright (c) 2007-2008 UT-Battelle, LLC. All rights reserved. +Copyright (c) 2007-2019 IBM Corporation. All rights reserved. +Copyright (c) 1998-2005 Forschungszentrum Juelich, Juelich Supercomputing + Centre, Federal Republic of Germany +Copyright (c) 2005-2008 ZIH, TU Dresden, Federal Republic of Germany +Copyright (c) 2007 Evergrid, Inc. All rights reserved. +Copyright (c) 2008 Chelsio, Inc. All rights reserved. +Copyright (c) 2008-2009 Institut National de Recherche en + Informatique. All rights reserved. +Copyright (c) 2007 Lawrence Livermore National Security, LLC. + All rights reserved. +Copyright (c) 2007-2019 Mellanox Technologies. All rights reserved. +Copyright (c) 2006-2010 QLogic Corporation. All rights reserved. +Copyright (c) 2008-2010 Oak Ridge National Labs. All rights reserved. +Copyright (c) 2006-2010 Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2009 Bull SAS. All rights reserved. +Copyright (c) 2010 ARM ltd. All rights reserved. +Copyright (c) 2010-2011 Alex Brick . All rights reserved. +Copyright (c) 2012 The University of Wisconsin-La Crosse. All rights + reserved. +Copyright (c) 2013-2019 Intel, Inc. All rights reserved. +Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved. 
+Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights + reserved. + +The following LICENSE pertains to both PMIx and any code ported +from Open MPI. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer listed + in this license in the documentation and/or other materials + provided with the distribution. + +- Neither the name of the copyright holders nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +The copyright holders provide no reassurances that the source code +provided does not infringe any patent, copyright, or any other +intellectual property rights of third parties. The copyright holders +disclaim any liability to any recipient for claims brought against +recipient by any third party for infringement of that parties +intellectual property rights. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ------------------------------------------------------------------------------- The following third party programs have their own third party programs. These