From b0bd146a4a1fca183efcfae7a7f35245a39f0564 Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Mon, 21 Oct 2024 14:32:56 -0700 Subject: [PATCH] [L0] Refactor Copy Engine Usage checks for Performance - Only use copy engines given discrete devices for Memory Copy. - Given non D2D memory copy, then copy engines are preferred only when the device has both copy and link copy engine ordinals active. - The above logic changes provide for the best performance when using copy engines in memory copy across device types. Signed-off-by: Neil R. Spruit --- source/adapters/level_zero/command_buffer.cpp | 15 +---- source/adapters/level_zero/memory.cpp | 55 +++++++++---------- source/adapters/level_zero/memory.hpp | 3 + 3 files changed, 31 insertions(+), 42 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 431e544101..eb66544691 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -784,19 +784,10 @@ ur_result_t urCommandBufferAppendUSMMemcpyExp( std::ignore = Event; std::ignore = Command; - bool PreferCopyEngine = !IsDevicePointer(CommandBuffer->Context, Src) || - !IsDevicePointer(CommandBuffer->Context, Dst); - // For better performance, Copy Engines are not preferred given Shared - // pointers on DG2. 
- if (CommandBuffer->Device->isDG2() && - (IsSharedPointer(CommandBuffer->Context, Src) || - IsSharedPointer(CommandBuffer->Context, Dst))) { - PreferCopyEngine = false; - } - PreferCopyEngine |= UseCopyEngineForD2DCopy; - return enqueueCommandBufferMemCopyHelper( - UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, PreferCopyEngine, + UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, + PreferCopyEngineUsage(CommandBuffer->Device, CommandBuffer->Context, Src, + Dst), NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index 08ff8b5656..893d5ff817 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -57,6 +57,27 @@ bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr) { return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_SHARED); } +// Helper Function to check if the Copy Engine should be preferred given the +// types of memory used. +bool PreferCopyEngineUsage(ur_device_handle_t Device, + ur_context_handle_t Context, const void *Src, + void *Dst) { + bool PreferCopyEngine = false; + // Given Integrated Devices, Copy Engines are not preferred for any Copy + // operations. + if (!Device->isIntegrated()) { + // Given non D2D Copies, for better performance, Copy Engines are preferred + // only if one has both the Main and Link Copy Engines. + if (Device->hasLinkCopyEngine() && Device->hasMainCopyEngine() && + (!IsDevicePointer(Context, Src) || !IsDevicePointer(Context, Dst))) { + PreferCopyEngine = true; + } + } + // Temporary option added to use force engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + return PreferCopyEngine; +} + // Shared by all memory read/write/copy PI interfaces. // PI interfaces must have queue's and destination buffer's mutexes locked for // exclusive use and source buffer's mutex locked for shared use on entry. 
@@ -1189,23 +1210,10 @@ ur_result_t urEnqueueUSMMemcpy( ) { std::scoped_lock lock(Queue->Mutex); - // Device to Device copies are found to execute slower on copy engine - // (versus compute engine). - bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || - !IsDevicePointer(Queue->Context, Dst); - // For better performance, Copy Engines are not preferred given Shared - // pointers on DG2. - if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) || - IsSharedPointer(Queue->Context, Dst))) { - PreferCopyEngine = false; - } - - // Temporary option added to use copy engine for D2D copy - PreferCopyEngine |= UseCopyEngineForD2DCopy; - return enqueueMemCopyHelper( // TODO: do we need a new command type for this? UR_COMMAND_MEM_BUFFER_COPY, Queue, Dst, Blocking, Size, Src, - NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); + NumEventsInWaitList, EventWaitList, OutEvent, + PreferCopyEngineUsage(Queue->Device, Queue->Context, Src, Dst)); } ur_result_t urEnqueueUSMPrefetch( @@ -1396,26 +1404,13 @@ ur_result_t urEnqueueUSMMemcpy2D( std::scoped_lock lock(Queue->Mutex); - // Device to Device copies are found to execute slower on copy engine - // (versus compute engine). - bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || - !IsDevicePointer(Queue->Context, Dst); - // For better performance, Copy Engines are not preferred given Shared - // pointers on DG2. - if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) || - IsSharedPointer(Queue->Context, Dst))) { - PreferCopyEngine = false; - } - - // Temporary option added to use copy engine for D2D copy - PreferCopyEngine |= UseCopyEngineForD2DCopy; - return enqueueMemCopyRectHelper( // TODO: do we need a new command type for // this? 
UR_COMMAND_MEM_BUFFER_COPY_RECT, Queue, Src, Dst, ZeroOffset, ZeroOffset, Region, SrcPitch, DstPitch, 0, /*SrcSlicePitch=*/ 0, /*DstSlicePitch=*/ - Blocking, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); + Blocking, NumEventsInWaitList, EventWaitList, Event, + PreferCopyEngineUsage(Queue->Device, Queue->Context, Src, Dst)); } static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, diff --git a/source/adapters/level_zero/memory.hpp b/source/adapters/level_zero/memory.hpp index b8e683e16e..c2e653b297 100644 --- a/source/adapters/level_zero/memory.hpp +++ b/source/adapters/level_zero/memory.hpp @@ -30,6 +30,9 @@ struct ur_device_handle_t_; bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr); bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr); +bool PreferCopyEngineUsage(ur_device_handle_t Device, + ur_context_handle_t Context, const void *Src, + void *Dst); // This is an experimental option to test performance of device to device copy // operations on copy engines (versus compute engine)