Skip to content

Commit

Permalink
[L0] Refactor Copy Engine Usage checks for Performance
Browse files Browse the repository at this point in the history
- Only use copy engines given discrete devices for Memory Copy.
- For non-D2D memory copies, copy engines are preferred only when
  the device has both the main copy and link copy engine ordinals active.
- The above logic changes provide the best performance when using
  copy engines for memory copies across device types.

Signed-off-by: Neil R. Spruit <[email protected]>
  • Loading branch information
nrspruit committed Oct 30, 2024
1 parent 8c0cc4c commit b0bd146
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 42 deletions.
15 changes: 3 additions & 12 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -784,19 +784,10 @@ ur_result_t urCommandBufferAppendUSMMemcpyExp(
std::ignore = Event;
std::ignore = Command;

bool PreferCopyEngine = !IsDevicePointer(CommandBuffer->Context, Src) ||
!IsDevicePointer(CommandBuffer->Context, Dst);
// For better performance, Copy Engines are not preferred given Shared
// pointers on DG2.
if (CommandBuffer->Device->isDG2() &&
(IsSharedPointer(CommandBuffer->Context, Src) ||
IsSharedPointer(CommandBuffer->Context, Dst))) {
PreferCopyEngine = false;
}
PreferCopyEngine |= UseCopyEngineForD2DCopy;

return enqueueCommandBufferMemCopyHelper(
UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, PreferCopyEngine,
UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size,
PreferCopyEngineUsage(CommandBuffer->Device, CommandBuffer->Context, Src,
Dst),
NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
}

Expand Down
55 changes: 25 additions & 30 deletions source/adapters/level_zero/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,27 @@ bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr) {
return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_SHARED);
}

// Decides whether a memory copy from Src to Dst should prefer the copy engine
// over the compute engine.
//
// The copy engine is preferred when all of the following hold:
//   - Device is not an integrated device,
//   - Device reports both a link copy engine and a main copy engine,
//   - at least one of Src/Dst is not a device pointer in Context (i.e. the
//     copy is not device-to-device).
// Independently of the above, the UseCopyEngineForD2DCopy option forces the
// copy engine to be preferred.
bool PreferCopyEngineUsage(ur_device_handle_t Device,
                           ur_context_handle_t Context, const void *Src,
                           void *Dst) {
  // Integrated devices never prefer the copy engine; discrete devices only do
  // so when both the Main and Link Copy Engines are available.
  const bool DiscreteWithBothEngines = !Device->isIntegrated() &&
                                       Device->hasLinkCopyEngine() &&
                                       Device->hasMainCopyEngine();

  // The pointer queries are only issued when the device qualifies; a copy
  // counts as non-D2D unless both endpoints are device pointers.
  const bool HeuristicPrefersCopyEngine =
      DiscreteWithBothEngines &&
      !(IsDevicePointer(Context, Src) && IsDevicePointer(Context, Dst));

  // Temporary option that forces copy engine usage (added for D2D copies).
  return HeuristicPrefersCopyEngine || UseCopyEngineForD2DCopy;
}

// Shared by all memory read/write/copy PI interfaces.
// PI interfaces must have queue's and destination buffer's mutexes locked for
// exclusive use and source buffer's mutex locked for shared use on entry.
Expand Down Expand Up @@ -1189,23 +1210,10 @@ ur_result_t urEnqueueUSMMemcpy(
) {
std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);

// Device to Device copies are found to execute slower on copy engine
// (versus compute engine).
bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) ||
!IsDevicePointer(Queue->Context, Dst);
// For better performance, Copy Engines are not preferred given Shared
// pointers on DG2.
if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) ||
IsSharedPointer(Queue->Context, Dst))) {
PreferCopyEngine = false;
}

// Temporary option added to use copy engine for D2D copy
PreferCopyEngine |= UseCopyEngineForD2DCopy;

return enqueueMemCopyHelper( // TODO: do we need a new command type for this?
UR_COMMAND_MEM_BUFFER_COPY, Queue, Dst, Blocking, Size, Src,
NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine);
NumEventsInWaitList, EventWaitList, OutEvent,
PreferCopyEngineUsage(Queue->Device, Queue->Context, Src, Dst));
}

ur_result_t urEnqueueUSMPrefetch(
Expand Down Expand Up @@ -1396,26 +1404,13 @@ ur_result_t urEnqueueUSMMemcpy2D(

std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);

// Device to Device copies are found to execute slower on copy engine
// (versus compute engine).
bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) ||
!IsDevicePointer(Queue->Context, Dst);
// For better performance, Copy Engines are not preferred given Shared
// pointers on DG2.
if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) ||
IsSharedPointer(Queue->Context, Dst))) {
PreferCopyEngine = false;
}

// Temporary option added to use copy engine for D2D copy
PreferCopyEngine |= UseCopyEngineForD2DCopy;

return enqueueMemCopyRectHelper( // TODO: do we need a new command type for
// this?
UR_COMMAND_MEM_BUFFER_COPY_RECT, Queue, Src, Dst, ZeroOffset, ZeroOffset,
Region, SrcPitch, DstPitch, 0, /*SrcSlicePitch=*/
0, /*DstSlicePitch=*/
Blocking, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine);
Blocking, NumEventsInWaitList, EventWaitList, Event,
PreferCopyEngineUsage(Queue->Device, Queue->Context, Src, Dst));
}

static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat,
Expand Down
3 changes: 3 additions & 0 deletions source/adapters/level_zero/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ struct ur_device_handle_t_;

bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr);
bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr);
// Returns true when the copy engine should be preferred for a copy from Src
// to Dst: Device is not integrated, has both the main and link copy engines,
// and at least one of Src/Dst is not a device pointer in Context. Also returns
// true whenever the UseCopyEngineForD2DCopy option is set.
bool PreferCopyEngineUsage(ur_device_handle_t Device,
ur_context_handle_t Context, const void *Src,
void *Dst);

// This is an experimental option to test performance of device to device copy
// operations on copy engines (versus compute engine)
Expand Down

0 comments on commit b0bd146

Please sign in to comment.