From 502406d5d09462fadad410a521d997690cc32223 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 23 Jan 2024 19:19:00 -0600 Subject: [PATCH 01/16] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch introduces a new command-line option for clang, namely, amdgpu-precise-mem-op. When this option is specified, a waitcnt instruction is generated after each memory load/store instruction. The counter values are always 0, but which counters are involved depends on the memory instruction. --- clang/include/clang/Driver/Options.td | 4 + clang/test/Driver/amdgpu-features.c | 6 + llvm/lib/Target/AMDGPU/AMDGPU.td | 4 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 79 +++++++ .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 199 ++++++++++++++++++ 6 files changed, 295 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index c3e90a70925b78..bba239c86d5e1c 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4902,6 +4902,10 @@ defm tgsplit : SimpleMFlag<"tgsplit", "Enable", "Disable", defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64", "Specify wavefront size 64", "Specify wavefront size 32", " mode (AMDGPU only)">; +defm amdgpu_precise_memory_op + : SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable", + " precise memory mode (AMDGPU only)", + m_amdgpu_Features_Group>; defm unsafe_fp_atomics : BoolMOption<"unsafe-fp-atomics", TargetOpts<"AllowAMDGPUUnsafeFPAtomics">, DefaultFalse, diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c index a516bc6b7ff200..57d31ccedd8783 100644 --- a/clang/test/Driver/amdgpu-features.c +++ b/clang/test/Driver/amdgpu-features.c @@ -32,3 +32,9 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-cumode %s 2>&1 | FileCheck --check-prefix=NO-CUMODE %s // NO-CUMODE: "-target-feature" "-cumode" + +// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mamdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=PREC-MEM %s +// PREC-MEM: "-target-feature" "+amdgpu-precise-memory-op" + +// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s +// NO-PREC-MEM: "-target-feature" "-amdgpu-precise-memory-op" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 37dcfef3b2a3da..c6aea7f0865fae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -168,6 +168,10 @@ def FeatureCuMode : SubtargetFeature<"cumode", "Enable CU wavefront execution mode" >; +def FeaturePreciseMemory + : SubtargetFeature<"amdgpu-precise-memory-op", "EnablePreciseMemory", + "true", "Enable precise memory mode">; + def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 4da10beabe3162..46260e793f954a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -87,6 +87,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool EnableTgSplit = false; bool EnableCuMode = false; bool TrapHandler = false; + bool EnablePreciseMemory = false; // Used as options. 
bool EnableLoadStoreOpt = false; @@ -599,6 +600,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return EnableCuMode; } + bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 62306fa667b360..4c16dae24dad0b 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -17,6 +17,7 @@ #include "AMDGPUMachineModuleInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -24,6 +25,8 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" +#include + using namespace llvm; using namespace llvm::AMDGPU; @@ -661,6 +664,9 @@ class SIMemoryLegalizer final : public MachineFunctionPass { bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); + bool GFX9InsertWaitcntForPreciseMem(MachineFunction &MF); + bool GFX10And11InsertWaitcntForPreciseMem(MachineFunction &MF); + public: static char ID; @@ -2622,6 +2628,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } +bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + IsaVersion IV = getIsaVersion(ST.getCPU()); + + bool Changed = false; + + for (auto &MBB : MF) { + for (auto MI = MBB.begin(); MI != MBB.end();) { + MachineInstr &Inst = *MI; + ++MI; + if (Inst.mayLoadOrStore() == false) + continue; + + // Todo: if next insn is an s_waitcnt + AMDGPU::Waitcnt Wait; + + if (!(Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic)) { + if (TII->isSMRD(Inst)) { // scalar + Wait.DsCnt = 0; // LgkmCnt + } else { // vector + if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst)) // VMEM load + Wait.LoadCnt = 0; // VmCnt + else if (TII->isFLAT(Inst)) { // Flat load + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + } else // LDS load ? + Wait.DsCnt = 0; // LgkmCnt + } else { // vector store + if (TII->isVMEM(Inst)) // VMEM store + Wait.LoadCnt = 0; // VmCnt + else if (TII->isFLAT(Inst)) { // Flat store + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + } else + Wait.DsCnt = 0; // LDS store? 
LgkmCnt + } + } // vector + } else { // atomic + Wait.DsCnt = 0; // LgkmCnt + Wait.LoadCnt = 0; // VmCnt + } + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + Changed = true; + } + } + return Changed; +} + +bool SIMemoryLegalizer::GFX10And11InsertWaitcntForPreciseMem( + MachineFunction &MF) { + for (auto &MBB : MF) { + for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { + MachineInstr &Inst = *MI; + if (Inst.mayLoadOrStore() == false) + continue; + } + } + return true; +} + bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; @@ -2662,6 +2732,15 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { } Changed |= removeAtomicPseudoMIs(); + + const GCNSubtarget &ST = MF.getSubtarget(); + if (ST.isPreciseMemoryEnabled()) { + if (AMDGPU::isGFX10Plus(ST)) + Changed |= GFX10And11InsertWaitcntForPreciseMem(MF); + else + Changed |= GFX9InsertWaitcntForPreciseMem(MF); + } + return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 00000000000000..abb9b9071227f8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,199 @@ +; Testing the -amdgpu-precise-memory-op option +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; COM: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_nand_i32_global: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NOT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v3 +; GFX9-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + 
%result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst + ret i32 %result +} + +; from bf16.ll +; covers buffer_load, buffer_store, flat_load, flat_store, global_load, global_store +define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; +; GFX9-LABEL: test_load_store: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load bfloat, ptr addrspace(1) %in + store bfloat %val, ptr addrspace(1) %out + ret void +} + +; from scratch-simple.ll +; covers scratch_load, scratch_store +; +; GFX9-FLATSCR-LABEL: {{^}}vs_main: +; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +define amdgpu_vs float @vs_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; from udiv.ll +; covers s_load +define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { +; GFX9-LABEL: udiv_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NOT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 + %r = udiv i32 %x, %y + store i32 %r, ptr addrspace(1) %out + ret void +} + +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) + +; from smrd.ll +; covers s_buffer_load +; GFX9-LABEL: {{^}}smrd_sgpr_offset: +; GFX9: s_buffer_load_dword s{{[0-9]}}, s[0:3], s4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 { +main_body: + %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) + ret float %r +} + +; from atomic_load_add.ll +; covers s_load, ds_add +; GFX9-LABEL: atomic_add_local: +; GFX9: ; %bb.1: +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9: ds_add_u32 v0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { + %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst + ret void +} + +declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) + +; from atomic_optimizations_buffer.ll +; covers buffer_atomic +; GFX9-LABEL: add_i32_constant: +; GFX9: ; %bb.1: +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { +entry: + %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) + store i32 
%old, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) + +; from llvm.amdgcn.image.load.a16.ll +; covers image_load +; GFX9-LABEL: {{^}}load.f32.1d: +; GFX9: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { +main_body: + %x = extractelement <2 x i16> %coords, i32 0 + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i32>, i32, i32) + +; from llvm.amdgcn.image.store.a16.ll +; covers image_store +define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) { +; GFX9-LABEL: store_f32_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +main_body: + %x = extractelement <2 x i16> %coords, i32 0 + call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) + +; from llvm.amdgcn.image.atomic.dim.ll +; covers image_atomic +; GFX90A-LABEL: {{^}}atomic_swap_1d: +; GFX90A: image_atomic_swap v0, v{{[02468]}}, s[0:7] dmask:0x1 unorm glc{{$}} +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +main_body: + %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out = bitcast i32 %v to float + ret float %out +} + + + + From 6624b6a49362009036b03abb2c3d6e1cfb5ab301 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 30 Jan 2024 19:21:01 -0600 Subject: [PATCH 02/16] Combined insertions of waitcnt with existing SIMemoryLegalizer code. --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 301 +++++++++++++----- ...l => insert_waitcnt_for_precise_memory.ll} | 187 ++++++++++- 2 files changed, 389 insertions(+), 99 deletions(-) rename llvm/test/CodeGen/AMDGPU/{insert_waitcnt_for_all.ll => insert_waitcnt_for_precise_memory.ll} (62%) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 4c16dae24dad0b..319a071dd791a9 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -25,8 +25,6 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" -#include - using namespace llvm; using namespace llvm::AMDGPU; @@ -628,12 +626,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; }; +class SIPreciseMemorySupport { +protected: + const GCNSubtarget &ST; + const SIInstrInfo *TII = nullptr; + + IsaVersion IV; + + SIPreciseMemorySupport(const GCNSubtarget &ST) : ST(ST) { + TII = ST.getInstrInfo(); + IV = getIsaVersion(ST.getCPU()); + } + +public: + static std::unique_ptr create(const GCNSubtarget &ST); + + virtual bool handleNonAtomic(MachineBasicBlock::iterator &MI) = 0; + /// Handles atomic instruction \p MI with \p ret indicating whether \p MI + /// returns a result. 
+ virtual bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) = 0; +}; + +class SIGfx9PreciseMemorySupport : public SIPreciseMemorySupport { +public: + SIGfx9PreciseMemorySupport(const GCNSubtarget &ST) + : SIPreciseMemorySupport(ST) {} + bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; + bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; +}; + +class SIGfx10And11PreciseMemorySupport : public SIPreciseMemorySupport { +public: + SIGfx10And11PreciseMemorySupport(const GCNSubtarget &ST) + : SIPreciseMemorySupport(ST) {} + bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; + bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; +}; + +std::unique_ptr +SIPreciseMemorySupport::create(const GCNSubtarget &ST) { + GCNSubtarget::Generation Generation = ST.getGeneration(); + if (Generation < AMDGPUSubtarget::GFX10) + return std::make_unique(ST); + return std::make_unique(ST); +} + +bool SIGfx9PreciseMemorySupport ::handleNonAtomic( + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + AMDGPU::Waitcnt Wait; + + if (TII->isSMRD(Inst)) { // scalar + if (Inst.mayStore()) + return false; + Wait.DsCnt = 0; // LgkmCnt + } else { // vector + if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst)) { // VMEM load + Wait.LoadCnt = 0; // VmCnt + } else if (TII->isFLAT(Inst)) { // Flat load + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + } else { // LDS load + Wait.DsCnt = 0; // LgkmCnt + } + } else { // vector store + if (TII->isVMEM(Inst)) { // VMEM store + Wait.LoadCnt = 0; // VmCnt + } else if (TII->isFLAT(Inst)) { // Flat store + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + } else { + Wait.DsCnt = 0; // LDS store; LgkmCnt + } + } + } + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + return true; +} + +bool SIGfx9PreciseMemorySupport ::handleAtomic(MachineBasicBlock::iterator &MI, + bool ret) { + assert(MI->mayLoadOrStore()); + + AMDGPU::Waitcnt Wait; + + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + return true; +} + +bool SIGfx10And11PreciseMemorySupport ::handleNonAtomic( + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + AMDGPU::Waitcnt Wait; + + bool BuildWaitCnt = true; + bool BuildVsCnt = false; + + if (TII->isSMRD(Inst)) { // scalar + if (Inst.mayStore()) + return false; + Wait.DsCnt = 0; // LgkmCnt + } else { // vector + if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst)) { // VMEM load + Wait.LoadCnt = 0; // VmCnt + } else if (TII->isFLAT(Inst)) { // Flat load + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + } else { // LDS load + Wait.DsCnt = 0; // LgkmCnt + } + } + + // For some instructions, mayLoad() and mayStore() can be both true. 
+ if (Inst.mayStore()) { // vector store; an instruction can be both + // load/store + if (TII->isVMEM(Inst)) { // VMEM store + if (!Inst.mayLoad()) + BuildWaitCnt = false; + BuildVsCnt = true; + } else if (TII->isFLAT(Inst)) { // Flat store + Wait.DsCnt = 0; // LgkmCnt + BuildVsCnt = true; + } else { + Wait.DsCnt = 0; // LDS store; LgkmCnt + } + } + } + + MachineBasicBlock &MBB = *MI->getParent(); + if (BuildWaitCnt) { + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + } + + if (BuildVsCnt) { + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + --MI; + } + return true; +} + +bool SIGfx10And11PreciseMemorySupport ::handleAtomic( + MachineBasicBlock::iterator &MI, bool ret) { + assert(MI->mayLoadOrStore()); + + AMDGPU::Waitcnt Wait; + + Wait.DsCnt = 0; // LgkmCnt + if (ret) + Wait.LoadCnt = 0; // VmCnt + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + if (!ret) { + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + --MI; + } + return true; +} + class SIMemoryLegalizer final : public MachineFunctionPass { private: /// Cache Control. std::unique_ptr CC = nullptr; + /// Precise Memory support. + std::unique_ptr PM = nullptr; + /// List of atomic pseudo instructions. std::list AtomicPseudoMIs; @@ -664,9 +847,6 @@ class SIMemoryLegalizer final : public MachineFunctionPass { bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); - bool GFX9InsertWaitcntForPreciseMem(MachineFunction &MF); - bool GFX10And11InsertWaitcntForPreciseMem(MachineFunction &MF); - public: static char ID; @@ -2490,9 +2670,13 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, // Atomic instructions already bypass caches to the scope specified by the // SyncScope operand. Only non-atomic volatile and nontemporal/last-use // instructions need additional treatment. - Changed |= CC->enableVolatileAndOrNonTemporal( - MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(), - MOI.isNonTemporal(), MOI.isLastUse()); + // SyncScope operand. Only non-atomic volatile and nontemporal instructions + // need additional treatment. + Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(), + SIMemOp::LOAD, MOI.isVolatile(), + MOI.isNonTemporal()); + if (PM) + Changed |= PM->handleNonAtomic(MI); return Changed; } @@ -2531,6 +2715,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. Changed |= CC->expandSystemScopeStore(MI); + + if (PM) + Changed |= PM->handleNonAtomic(MI); + return Changed; } @@ -2611,12 +2799,13 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || MOI.getFailureOrdering() == AtomicOrdering::Acquire || MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= CC->insertWait(MI, MOI.getScope(), - MOI.getInstrAddrSpace(), - isAtomicRet(*MI) ? 
SIMemOp::LOAD : - SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), - Position::AFTER); + if (PM) + Changed |= PM->handleAtomic(MI, isAtomicRet(*MI)); + else + Changed |= CC->insertWait( + MI, MOI.getScope(), MOI.getInstrAddrSpace(), + isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); @@ -2625,79 +2814,22 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } - return Changed; -} - -bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - IsaVersion IV = getIsaVersion(ST.getCPU()); - - bool Changed = false; - - for (auto &MBB : MF) { - for (auto MI = MBB.begin(); MI != MBB.end();) { - MachineInstr &Inst = *MI; - ++MI; - if (Inst.mayLoadOrStore() == false) - continue; + if (PM) + Changed |= PM->handleNonAtomic(MI); - // Todo: if next insn is an s_waitcnt - AMDGPU::Waitcnt Wait; - - if (!(Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic)) { - if (TII->isSMRD(Inst)) { // scalar - Wait.DsCnt = 0; // LgkmCnt - } else { // vector - if (Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) // VMEM load - Wait.LoadCnt = 0; // VmCnt - else if (TII->isFLAT(Inst)) { // Flat load - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - } else // LDS load ? - Wait.DsCnt = 0; // LgkmCnt - } else { // vector store - if (TII->isVMEM(Inst)) // VMEM store - Wait.LoadCnt = 0; // VmCnt - else if (TII->isFLAT(Inst)) { // Flat store - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - } else - Wait.DsCnt = 0; // LDS store? LgkmCnt - } - } // vector - } else { // atomic - Wait.DsCnt = 0; // LgkmCnt - Wait.LoadCnt = 0; // VmCnt - } - - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - Changed = true; - } - } return Changed; } -bool SIMemoryLegalizer::GFX10And11InsertWaitcntForPreciseMem( - MachineFunction &MF) { - for (auto &MBB : MF) { - for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { - MachineInstr &Inst = *MI; - if (Inst.mayLoadOrStore() == false) - continue; - } - } - return true; -} - bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; SIMemOpAccess MOA(MF); CC = SICacheControl::create(MF.getSubtarget()); + const GCNSubtarget &ST = MF.getSubtarget(); + if (ST.isPreciseMemoryEnabled()) + PM = SIPreciseMemorySupport::create(ST); + for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { @@ -2716,8 +2848,12 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { MI = II->getIterator(); } - if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) + if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) { + if (PM && MI->mayLoadOrStore()) { + Changed |= PM->handleNonAtomic(MI); + } continue; + } if (const auto &MOI = MOA.getLoadInfo(MI)) Changed |= expandLoad(*MOI, MI); @@ -2732,15 +2868,6 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { } Changed |= removeAtomicPseudoMIs(); - - const GCNSubtarget &ST = MF.getSubtarget(); - if (ST.isPreciseMemoryEnabled()) { - if (AMDGPU::isGFX10Plus(ST)) - Changed |= GFX10And11InsertWaitcntForPreciseMem(MF); - else - Changed |= GFX9InsertWaitcntForPreciseMem(MF); - } - return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll 
b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll similarity index 62% rename from llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll rename to llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index abb9b9071227f8..8d2c0c73aa1520 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -1,8 +1,7 @@ -; Testing the -amdgpu-precise-memory-op option -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A -; COM: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s ; from atomicrmw-expand.ll ; covers flat_load, flat_atomic @@ -14,6 +13,14 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A: .LBB0_1: ; %atomicrmw.start ; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; +; GFX10-LABEL: syncscope_workgroup_nortn: +; GFX10: ; %bb.0: +; GFX10: flat_load_dword v4, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10: .LBB0_1: ; %atomicrmw.start +; GFX10: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret void } @@ -44,6 +51,28 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: atomic_nand_i32_global: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v2, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NOT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_not_b32_e32 v2, v3 +; GFX10-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB1_1 %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst ret i32 %result } @@ -65,10 +94,9 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_short v[2:3], v0, off -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = load bfloat, ptr addrspace(1) %in @@ -101,6 +129,16 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NOT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; +; GFX10-LABEL: udiv_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX10: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm %r = udiv i32 %x, %y store i32 %r, ptr addrspace(1) %out ret void @@ -113,6 +151,10 @@ declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) ; GFX9-LABEL: {{^}}smrd_sgpr_offset: ; GFX9: s_buffer_load_dword s{{[0-9]}}, s[0:3], s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX10-LABEL: {{^}}smrd_sgpr_offset: +; GFX10: s_buffer_load_dword s0, s[0:3], s4 offset:0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 { main_body: %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) @@ -127,6 +169,15 @@ main_body: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; +; GFX10-LABEL: atomic_add_local: +; GFX10: ; %bb.1: +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10: ds_add_u32 v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 + define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst ret void @@ -141,7 +192,16 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; +; GFX10-LABEL: add_i32_constant: +; GFX10: ; %bb.1: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10: buffer_atomic_add v1, off, s[4:7], 0 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 + define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { entry: %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) @@ -155,7 +215,13 @@ declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i3 ; covers image_load ; GFX9-LABEL: {{^}}load.f32.1d: ; GFX9: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; +; GFX10-LABEL: {{^}}load.f32.1d: +; GFX10: %bb.0: ; %main_body +; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) + define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ 
-171,9 +237,14 @@ define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 ; GFX9-LABEL: store_f32_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; +; GFX10-LABEL: store_f32_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0) @@ -186,7 +257,14 @@ declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, ; covers image_atomic ; GFX90A-LABEL: {{^}}atomic_swap_1d: ; GFX90A: image_atomic_swap v0, v{{[02468]}}, s[0:7] dmask:0x1 unorm glc{{$}} -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; +; GFX10-LABEL: {{^}}atomic_swap_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 + define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { main_body: %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -194,6 +272,91 @@ main_body: ret float %out } +; from lds-bounds.ll +; covers ds_write_b64 +@compute_lds = external addrspace(3) global [512 x i32], align 16 +; GFX9-LABEL: {{^}}store_aligned: +; GFX9: ds_write_b64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX10-LABEL: {{^}}store_aligned: +; GFX10: ds_write_b64 v0, v[1:2] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) + + +define amdgpu_cs void @store_aligned(ptr addrspace(3) %ptr) #0 { +entry: + %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1 + + store i32 42, ptr addrspace(3) %ptr, align 8 + store i32 43, ptr addrspace(3) %ptr.gep.1 + ret void +} + +; from lds-bounds.ll +; covers ds_read_b64 +; GFX9-LABEL: {{^}}load_aligned: +; GFX9: ds_read_b64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX10-LABEL: {{^}}load_aligned: +; GFX10: ds_read_b64 v[0:1], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +define amdgpu_cs <2 x float> @load_aligned(ptr addrspace(3) %ptr) #0 { +entry: + %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1 + + %v.0 = load i32, ptr addrspace(3) %ptr, align 8 + %v.1 = load i32, ptr addrspace(3) %ptr.gep.1 + + %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0 + %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1 + %bc = bitcast <2 x i32> %r.1 to <2 x float> + ret <2 x float> %bc +} + +; from lds-bounds.ll +; covers ds_write2_b32 +; GFX9-LABEL: {{^}}store_global_const_idx: +; GFX9: ds_write2_b32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX10-LABEL: {{^}}store_global_const_idx: +; GFX10: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) + +define amdgpu_cs void @store_global_const_idx() #0 { +entry: + %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3 + %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4 + + store i32 42, ptr addrspace(3) %ptr.a + store i32 43, ptr addrspace(3) %ptr.b + ret void +} + +; from lds-bounds.ll +; covers ds_read2_b32 +; GFX9-LABEL: {{^}}load_global_const_idx: +; GFX9: ds_read2_b32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX10-LABEL: {{^}}load_global_const_idx: +; GFX10: 
ds_read2_b32 v[0:1], v0 offset0:3 offset1:4 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +define amdgpu_cs <2 x float> @load_global_const_idx() #0 { +entry: + %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3 + %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4 + + %v.0 = load i32, ptr addrspace(3) %ptr.a + %v.1 = load i32, ptr addrspace(3) %ptr.b + + %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0 + %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1 + %bc = bitcast <2 x i32> %r.1 to <2 x float> + ret <2 x float> %bc +} From 8abdc34121273296658d1f20c633574b89ee7d2b Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 12 Feb 2024 14:01:05 -0600 Subject: [PATCH 03/16] Merge code for precise mem with the existing SICacheControl classes. --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 330 ++++++++++-------- .../insert_waitcnt_for_precise_memory.ll | 104 +++++- 2 files changed, 289 insertions(+), 145 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 319a071dd791a9..63d207418bb536 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -365,6 +365,18 @@ class SICacheControl { MachineBasicBlock::iterator &MI) const { return false; } + +public: + // The following is for supporting precise memory mode. When the option + // amdgpu-precise-memory is enabled, an s_waitcnt instruction is inserted + // after each memory instruction. + + virtual bool + handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) = 0; + /// Handles atomic instruction \p MI with \p ret indicating whether \p MI + /// returns a result. + virtual bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, + bool ret) = 0; }; class SIGfx6CacheControl : public SICacheControl { @@ -420,6 +432,11 @@ class SIGfx6CacheControl : public SICacheControl { SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; + + bool + handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) override; + bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, + bool ret) override; }; class SIGfx7CacheControl : public SIGfx6CacheControl { @@ -572,6 +589,11 @@ class SIGfx10CacheControl : public SIGfx7CacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; + + bool + handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) override; + bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, + bool ret) override; }; class SIGfx11CacheControl : public SIGfx10CacheControl { @@ -624,8 +646,14 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsLastUse) const override; bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; + + bool + handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) override; + bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, + bool ret) override; }; +#if 0 class SIPreciseMemorySupport { protected: const GCNSubtarget &ST; @@ -670,143 +698,7 @@ SIPreciseMemorySupport::create(const GCNSubtarget &ST) { return std::make_unique(ST); return std::make_unique(ST); } - -bool SIGfx9PreciseMemorySupport ::handleNonAtomic( - MachineBasicBlock::iterator &MI) { - assert(MI->mayLoadOrStore()); - - MachineInstr &Inst = *MI; - AMDGPU::Waitcnt Wait; - - if (TII->isSMRD(Inst)) { // scalar - if (Inst.mayStore()) - return false; - Wait.DsCnt = 0; // LgkmCnt - } else { // vector - if 
(Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) { // VMEM load - Wait.LoadCnt = 0; // VmCnt - } else if (TII->isFLAT(Inst)) { // Flat load - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - } else { // LDS load - Wait.DsCnt = 0; // LgkmCnt - } - } else { // vector store - if (TII->isVMEM(Inst)) { // VMEM store - Wait.LoadCnt = 0; // VmCnt - } else if (TII->isFLAT(Inst)) { // Flat store - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - } else { - Wait.DsCnt = 0; // LDS store; LgkmCnt - } - } - } - - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - MachineBasicBlock &MBB = *MI->getParent(); - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - --MI; - return true; -} - -bool SIGfx9PreciseMemorySupport ::handleAtomic(MachineBasicBlock::iterator &MI, - bool ret) { - assert(MI->mayLoadOrStore()); - - AMDGPU::Waitcnt Wait; - - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - MachineBasicBlock &MBB = *MI->getParent(); - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - --MI; - return true; -} - -bool SIGfx10And11PreciseMemorySupport ::handleNonAtomic( - MachineBasicBlock::iterator &MI) { - assert(MI->mayLoadOrStore()); - - MachineInstr &Inst = *MI; - AMDGPU::Waitcnt Wait; - - bool BuildWaitCnt = true; - bool BuildVsCnt = false; - - if (TII->isSMRD(Inst)) { // scalar - if (Inst.mayStore()) - return false; - Wait.DsCnt = 0; // LgkmCnt - } else { // vector - if (Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) { // VMEM load - Wait.LoadCnt = 0; // VmCnt - } else if (TII->isFLAT(Inst)) { // Flat load - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - } else { // LDS load - Wait.DsCnt = 0; // LgkmCnt - } - } - - // For some instructions, mayLoad() and mayStore() can be both true. - if (Inst.mayStore()) { // vector store; an instruction can be both - // load/store - if (TII->isVMEM(Inst)) { // VMEM store - if (!Inst.mayLoad()) - BuildWaitCnt = false; - BuildVsCnt = true; - } else if (TII->isFLAT(Inst)) { // Flat store - Wait.DsCnt = 0; // LgkmCnt - BuildVsCnt = true; - } else { - Wait.DsCnt = 0; // LDS store; LgkmCnt - } - } - } - - MachineBasicBlock &MBB = *MI->getParent(); - if (BuildWaitCnt) { - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - --MI; - } - - if (BuildVsCnt) { - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); - --MI; - } - return true; -} - -bool SIGfx10And11PreciseMemorySupport ::handleAtomic( - MachineBasicBlock::iterator &MI, bool ret) { - assert(MI->mayLoadOrStore()); - - AMDGPU::Waitcnt Wait; - - Wait.DsCnt = 0; // LgkmCnt - if (ret) - Wait.LoadCnt = 0; // VmCnt - - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - MachineBasicBlock &MBB = *MI->getParent(); - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - --MI; - if (!ret) { - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); - --MI; - } - return true; -} +#endif class SIMemoryLegalizer final : public MachineFunctionPass { private: @@ -815,7 +707,7 @@ class SIMemoryLegalizer final : public MachineFunctionPass { std::unique_ptr CC = nullptr; /// Precise Memory support. - std::unique_ptr PM = nullptr; + bool PM = false; /// List of atomic pseudo instructions. 
std::list AtomicPseudoMIs; @@ -2611,12 +2503,161 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } +<<<<<<< HEAD bool SIGfx12CacheControl::expandSystemScopeStore( MachineBasicBlock::iterator &MI) const { MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) return insertWaitsBeforeSystemScopeStore(MI); +======= +bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory( + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + AMDGPU::Waitcnt Wait; + + if (TII->isSMRD(Inst)) { // scalar + if (Inst.mayStore()) + return false; + Wait.DsCnt = 0; // LgkmCnt + } else { // vector + if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst)) { // VMEM load + Wait.LoadCnt = 0; // VmCnt + } else if (TII->isFLAT(Inst)) { // Flat load + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + } else { // LDS load + Wait.DsCnt = 0; // LgkmCnt + } + } else { // vector store + if (TII->isVMEM(Inst)) { // VMEM store + Wait.LoadCnt = 0; // VmCnt + } else if (TII->isFLAT(Inst)) { // Flat store + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + } else { + Wait.DsCnt = 0; // LDS store; LgkmCnt + } + } + } + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + return true; +} + +bool SIGfx6CacheControl ::handleAtomicForPreciseMemory( + MachineBasicBlock::iterator &MI, bool ret) { + assert(MI->mayLoadOrStore()); + + AMDGPU::Waitcnt Wait; + + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + return true; +} + +bool SIGfx10CacheControl ::handleNonAtomicForPreciseMemory( + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + AMDGPU::Waitcnt Wait; + + bool BuildWaitCnt = true; + bool BuildVsCnt = false; + + if (TII->isSMRD(Inst)) { // scalar + if (Inst.mayStore()) + return false; + Wait.DsCnt = 0; // LgkmCnt + } else { // vector + if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst)) { // VMEM load + Wait.LoadCnt = 0; // VmCnt + } else if (TII->isFLAT(Inst)) { // Flat load + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + } else { // LDS load + Wait.DsCnt = 0; // LgkmCnt + } + } + + // For some instructions, mayLoad() and mayStore() can be both true. 
+ if (Inst.mayStore()) { // vector store; an instruction can be both + // load/store + if (TII->isVMEM(Inst)) { // VMEM store + if (!Inst.mayLoad()) + BuildWaitCnt = false; + BuildVsCnt = true; + } else if (TII->isFLAT(Inst)) { // Flat store + Wait.DsCnt = 0; // LgkmCnt + BuildVsCnt = true; + } else { + Wait.DsCnt = 0; // LDS store; LgkmCnt + } + } + } + + MachineBasicBlock &MBB = *MI->getParent(); + if (BuildWaitCnt) { + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + } + + if (BuildVsCnt) { + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + --MI; + } + return true; +} + +bool SIGfx10CacheControl ::handleAtomicForPreciseMemory( + MachineBasicBlock::iterator &MI, bool ret) { + assert(MI->mayLoadOrStore()); + + AMDGPU::Waitcnt Wait; + + Wait.DsCnt = 0; // LgkmCnt + if (ret) + Wait.LoadCnt = 0; // VmCnt + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + if (!ret) { + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + --MI; + } + return true; +} + +bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory( + MachineBasicBlock::iterator &MI) { + // To be implemented. + return false; +} + +bool SIGfx12CacheControl ::handleAtomicForPreciseMemory( + MachineBasicBlock::iterator &MI, bool ret) { + // To be implemented. +>>>>>>> Merge code for precise mem with the existing SICacheControl classes. return false; } @@ -2676,7 +2717,7 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, SIMemOp::LOAD, MOI.isVolatile(), MOI.isNonTemporal()); if (PM) - Changed |= PM->handleNonAtomic(MI); + Changed |= CC->handleNonAtomicForPreciseMemory(MI); return Changed; } @@ -2717,7 +2758,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, Changed |= CC->expandSystemScopeStore(MI); if (PM) - Changed |= PM->handleNonAtomic(MI); + Changed |= CC->handleNonAtomicForPreciseMemory(MI); return Changed; } @@ -2800,7 +2841,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MOI.getFailureOrdering() == AtomicOrdering::Acquire || MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { if (PM) - Changed |= PM->handleAtomic(MI, isAtomicRet(*MI)); + Changed |= CC->handleAtomicForPreciseMemory(MI, isAtomicRet(*MI)); else Changed |= CC->insertWait( MI, MOI.getScope(), MOI.getInstrAddrSpace(), @@ -2815,7 +2856,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, } if (PM) - Changed |= PM->handleNonAtomic(MI); + Changed |= CC->handleNonAtomicForPreciseMemory(MI); return Changed; } @@ -2827,8 +2868,9 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { CC = SICacheControl::create(MF.getSubtarget()); const GCNSubtarget &ST = MF.getSubtarget(); + PM = false; if (ST.isPreciseMemoryEnabled()) - PM = SIPreciseMemorySupport::create(ST); + PM = true; for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { @@ -2850,7 +2892,7 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) { if (PM && MI->mayLoadOrStore()) { - Changed |= PM->handleNonAtomic(MI); + Changed |= CC->handleNonAtomicForPreciseMemory(MI); } continue; } diff --git 
a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 8d2c0c73aa1520..fa9d94ed4377b7 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX90A ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX10 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX11 ; from atomicrmw-expand.ll ; covers flat_load, flat_atomic @@ -21,6 +22,14 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX10: .LBB0_1: ; %atomicrmw.start ; GFX10: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; +; GFX11-LABEL: syncscope_workgroup_nortn: +; GFX11: ; %bb.0: +; GFX11: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11: .LBB0_1: ; %atomicrmw.start +; GFX11: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret void } @@ -73,6 +82,23 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; +; GFX11-LABEL: atomic_nand_i32_global: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11: .LBB1_1: ; %atomicrmw.start +; GFX11: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv + %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst ret i32 %result } @@ -99,6 +125,17 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_load_store: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = load bfloat, ptr addrspace(1) %in store bfloat %val, ptr addrspace(1) %out ret void @@ -139,6 +176,17 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: udiv_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX11: global_store_b32 v0, v1, s[0:1] +; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 + %r = udiv i32 %x, %y store i32 %r, ptr addrspace(1) %out ret void @@ -155,6 +203,11 @@ declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) ; GFX10-LABEL: {{^}}smrd_sgpr_offset: ; GFX10: s_buffer_load_dword s0, s[0:3], s4 offset:0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX11-LABEL: {{^}}smrd_sgpr_offset: +; GFX11: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 { main_body: %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) @@ -177,6 +230,14 @@ main_body: ; GFX10: ds_add_u32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; +; GFX11-LABEL: atomic_add_local: +; GFX11: ; %bb.1: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11: ds_add_u32 v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst @@ -201,6 +262,14 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i ; GFX10: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; +; GFX11-LABEL: add_i32_constant: +; GFX11: ; %bb.1: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { entry: @@ -221,6 +290,11 @@ declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i3 ; GFX10: %bb.0: ; %main_body ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; +; GFX11-LABEL: {{^}}load.f32.1d: +; GFX11: %bb.0: ; %main_body +; GFX11-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 +; GFX11-NEXT: s_waitcnt vmcnt(0) define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: @@ -245,6 +319,12 @@ define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 ; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_f32_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 + main_body: %x = extractelement <2 x i16> %coords, i32 0 call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0) @@ -264,6 +344,12 @@ declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, ; GFX10-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; +; GFX11-LABEL: {{^}}atomic_swap_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { 
main_body: @@ -282,7 +368,10 @@ main_body: ; GFX10-LABEL: {{^}}store_aligned: ; GFX10: ds_write_b64 v0, v[1:2] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) - +; +; GFX11-LABEL: {{^}}store_aligned: +; GFX11: ds_store_b64 v0, v[1:2] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) define amdgpu_cs void @store_aligned(ptr addrspace(3) %ptr) #0 { entry: @@ -303,6 +392,10 @@ entry: ; GFX10-LABEL: {{^}}load_aligned: ; GFX10: ds_read_b64 v[0:1], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX11-LABEL: {{^}}load_aligned: +; GFX11: ds_load_b64 v[0:1], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) define amdgpu_cs <2 x float> @load_aligned(ptr addrspace(3) %ptr) #0 { entry: @@ -326,6 +419,10 @@ entry: ; GFX10-LABEL: {{^}}store_global_const_idx: ; GFX10: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX11-LABEL: {{^}}store_global_const_idx: +; GFX11: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) define amdgpu_cs void @store_global_const_idx() #0 { entry: @@ -346,6 +443,11 @@ entry: ; GFX10-LABEL: {{^}}load_global_const_idx: ; GFX10: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX11-LABEL: {{^}}load_global_const_idx: +; GFX11: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) + define amdgpu_cs <2 x float> @load_global_const_idx() #0 { entry: %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3 From adaa16ca3a1f2ee15be9c6957ffdd9181b4d9a87 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 14 Feb 2024 17:37:06 -0600 Subject: [PATCH 04/16] Add support for GFX12. --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 93 ++++++++++-- .../insert_waitcnt_for_precise_memory.ll | 133 ++++++++++++++++-- 2 files changed, 202 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 63d207418bb536..b231570eeaba01 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2503,14 +2503,15 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -<<<<<<< HEAD bool SIGfx12CacheControl::expandSystemScopeStore( MachineBasicBlock::iterator &MI) const { MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) return insertWaitsBeforeSystemScopeStore(MI); + return false; +} + -======= bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory( MachineBasicBlock::iterator &MI) { assert(MI->mayLoadOrStore()); @@ -2593,7 +2594,7 @@ bool SIGfx10CacheControl ::handleNonAtomicForPreciseMemory( } } - // For some instructions, mayLoad() and mayStore() can be both true. + // For some vector instructions, mayLoad() and mayStore() can be both true. if (Inst.mayStore()) { // vector store; an instruction can be both // load/store if (TII->isVMEM(Inst)) { // VMEM store @@ -2650,15 +2651,80 @@ bool SIGfx10CacheControl ::handleAtomicForPreciseMemory( bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory( MachineBasicBlock::iterator &MI) { - // To be implemented. - return false; + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + unsigned WaitType = 0; + // For some vector instructions, mayLoad() and mayStore() can be both true. 
+ bool LoadAndStore = false; + + if (TII->isSMRD(Inst)) { // scalar + if (Inst.mayStore()) + return false; + + WaitType = AMDGPU::S_WAIT_KMCNT; + } else { // vector + if (Inst.mayLoad() && Inst.mayStore()) { + WaitType = AMDGPU::S_WAIT_LOADCNT; + LoadAndStore = true; + } else if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst)) { // VMEM load + WaitType = AMDGPU::S_WAIT_LOADCNT; + } else if (TII->isFLAT(Inst)) { // Flat load + WaitType = AMDGPU::S_WAIT_LOADCNT_DSCNT; + } else { // LDS load + WaitType = AMDGPU::S_WAIT_DSCNT; + } + } else { // vector store + if (TII->isVMEM(Inst)) { // VMEM store + WaitType = AMDGPU::S_WAIT_STORECNT; + } else if (TII->isFLAT(Inst)) { // Flat store + WaitType = AMDGPU::S_WAIT_STORECNT_DSCNT; + } else { + WaitType = AMDGPU::S_WAIT_DSCNT; + } + } + } + + assert(WaitType != 0); + + MachineBasicBlock &MBB = *MI->getParent(); + + unsigned Enc = 0; + if (WaitType == AMDGPU::S_WAIT_LOADCNT_DSCNT) { + AMDGPU::Waitcnt Wait; + Wait.DsCnt = 0; + Wait.LoadCnt = 0; + Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait); + } else if (WaitType == AMDGPU::S_WAIT_STORECNT_DSCNT) { + AMDGPU::Waitcnt Wait; + Wait.DsCnt = 0; + Wait.StoreCnt = 0; + Enc = AMDGPU::encodeStorecntDscnt(IV, Wait); + } + + BuildMI(MBB, ++MI, DebugLoc(), TII->get(WaitType)).addImm(Enc); + --MI; + if (LoadAndStore) { + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_STORECNT)) + .addImm(Enc); + --MI; + } + return true; } bool SIGfx12CacheControl ::handleAtomicForPreciseMemory( MachineBasicBlock::iterator &MI, bool ret) { - // To be implemented. ->>>>>>> Merge code for precise mem with the existing SICacheControl classes. - return false; + assert(MI->mayLoadOrStore()); + + MachineBasicBlock &MBB = *MI->getParent(); + if (ret) { + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT)).addImm(0); + } else { + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_STORECNT)).addImm(0); + } + --MI; + return true; } bool SIMemoryLegalizer::removeAtomicPseudoMIs() { @@ -2840,13 +2906,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || MOI.getFailureOrdering() == AtomicOrdering::Acquire || MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { - if (PM) - Changed |= CC->handleAtomicForPreciseMemory(MI, isAtomicRet(*MI)); - else - Changed |= CC->insertWait( - MI, MOI.getScope(), MOI.getInstrAddrSpace(), - isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER); + Changed |= + CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(), + isAtomicRet(*MI) ? 
SIMemOp::LOAD : SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index fa9d94ed4377b7..048a1f999f0195 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX10 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX12 ; from atomicrmw-expand.ll ; covers flat_load, flat_atomic @@ -30,13 +31,24 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX11: .LBB0_1: ; %atomicrmw.start ; GFX11: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; +; GFX12-LABEL: syncscope_workgroup_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret void } ; from atomicrmw-nand.ll ; covers global_atomic, global_load -define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; ; GFX9-LABEL: atomic_nand_i32_global: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -50,7 +62,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9-NEXT: v_not_b32_e32 v2, v3 ; GFX9-NEXT: v_or_b32_e32 v2, -5, v2 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -75,7 +87,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX10-NEXT: v_or_b32_e32 v2, -5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -95,17 +107,37 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX11-NEXT: v_or_b32_e32 v2, -5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; +; GFX12-LABEL: atomic_nand_i32_global: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v2, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12: .LBB1_1: ; 
%atomicrmw.start +; GFX12: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v2, v3 +; GFX12-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst ret i32 %result } ; from bf16.ll ; covers buffer_load, buffer_store, flat_load, flat_store, global_load, global_store -define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: test_load_store: ; GFX9: ; %bb.0: @@ -135,7 +167,21 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_load_store: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { %val = load bfloat, ptr addrspace(1) %in store bfloat %val, ptr addrspace(1) %out ret void @@ -158,7 +204,7 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; from udiv.ll ; covers s_load -define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { +; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -186,7 +232,17 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 +; +; GFX12-LABEL: udiv_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cvt_f32_u32 s4, s3 +; GFX12: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: s_nop 0 +define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { %r = udiv i32 %x, %y store i32 %r, ptr addrspace(1) %out ret void @@ -208,6 +264,12 @@ declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) ; GFX11: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; +; GFX12-LABEL: {{^}}smrd_sgpr_offset: +; GFX12: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12: s_wait_kmcnt 0x0 +; GFX12: v_mov_b32_e32 v0, s0 + define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 { main_body: %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) @@ -221,7 +283,7 @@ main_body: ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9: ds_add_u32 v0, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; ; GFX10-LABEL: atomic_add_local: ; GFX10: ; %bb.1: @@ -229,7 +291,7 @@ main_body: ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10: ds_add_u32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv ; ; GFX11-LABEL: 
atomic_add_local: ; GFX11: ; %bb.1: @@ -237,7 +299,15 @@ main_body: ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11: ds_add_u32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; +; GFX12-LABEL: atomic_add_local: +; GFX12: ; %bb.1: +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12: ds_add_u32 v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst @@ -270,6 +340,19 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i ; GFX11: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; +; GFX12-LABEL: add_i32_constant: +; GFX12: ; %bb.1: +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { entry: @@ -295,6 +378,11 @@ declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i3 ; GFX11: %bb.0: ; %main_body ; GFX11-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; +; GFX12-LABEL: {{^}}load.f32.1d: +; GFX12: %bb.0: ; %main_body +; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16 +; GFX12-NEXT: s_wait_loadcnt 0x0 define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: @@ -324,6 +412,11 @@ define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; +; GFX12-LABEL: store_f32_1d: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16 +; GFX12-NEXT: s_wait_storecnt 0x0 main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -350,6 +443,12 @@ declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, ; GFX11-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; +; GFX12-LABEL: {{^}}atomic_swap_1d: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { main_body: @@ -372,6 +471,10 @@ main_body: ; GFX11-LABEL: {{^}}store_aligned: ; GFX11: ds_store_b64 v0, v[1:2] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX12-LABEL: {{^}}store_aligned: +; GFX12: ds_store_b64 v0, v[1:2] +; GFX12-NEXT: s_wait_dscnt 0x0 define amdgpu_cs void @store_aligned(ptr addrspace(3) %ptr) #0 { entry: @@ -396,6 +499,10 @@ entry: ; GFX11-LABEL: {{^}}load_aligned: ; GFX11: ds_load_b64 v[0:1], v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX12-LABEL: {{^}}load_aligned: +; GFX12: ds_load_b64 v[0:1], v0 +; GFX12-NEXT: s_wait_dscnt 0x0 
define amdgpu_cs <2 x float> @load_aligned(ptr addrspace(3) %ptr) #0 { entry: @@ -423,6 +530,10 @@ entry: ; GFX11-LABEL: {{^}}store_global_const_idx: ; GFX11: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX12-LABEL: {{^}}store_global_const_idx: +; GFX12: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4 +; GFX12-NEXT: s_wait_dscnt 0x0 define amdgpu_cs void @store_global_const_idx() #0 { entry: @@ -447,6 +558,10 @@ entry: ; GFX11-LABEL: {{^}}load_global_const_idx: ; GFX11: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; +; GFX12-LABEL: {{^}}load_global_const_idx: +; GFX12: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4 +; GFX12-NEXT: s_wait_dscnt 0x0 define amdgpu_cs <2 x float> @load_global_const_idx() #0 { entry: From c02b87b11e918968f1dfdc3b9e8568fb53a2334c Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 15 Feb 2024 16:54:33 -0600 Subject: [PATCH 05/16] Some small changes based on code review. --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 131 ++++++------------- 1 file changed, 39 insertions(+), 92 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index b231570eeaba01..6cb9519d4169b4 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -373,10 +373,10 @@ class SICacheControl { virtual bool handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) = 0; - /// Handles atomic instruction \p MI with \p ret indicating whether \p MI - /// returns a result. + /// Handles atomic instruction \p MI with \p IsAtomicWithRet indicating + /// whether \p MI returns a result. virtual bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, - bool ret) = 0; + bool IsAtomicWithRet) = 0; }; class SIGfx6CacheControl : public SICacheControl { @@ -436,7 +436,7 @@ class SIGfx6CacheControl : public SICacheControl { bool handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) override; bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, - bool ret) override; + bool IsAtomicWithRet) override; }; class SIGfx7CacheControl : public SIGfx6CacheControl { @@ -593,7 +593,7 @@ class SIGfx10CacheControl : public SIGfx7CacheControl { bool handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) override; bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, - bool ret) override; + bool IsAtomicWithRet) override; }; class SIGfx11CacheControl : public SIGfx10CacheControl { @@ -650,56 +650,9 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) override; bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, - bool ret) override; -}; - -#if 0 -class SIPreciseMemorySupport { -protected: - const GCNSubtarget &ST; - const SIInstrInfo *TII = nullptr; - - IsaVersion IV; - - SIPreciseMemorySupport(const GCNSubtarget &ST) : ST(ST) { - TII = ST.getInstrInfo(); - IV = getIsaVersion(ST.getCPU()); - } - -public: - static std::unique_ptr create(const GCNSubtarget &ST); - - virtual bool handleNonAtomic(MachineBasicBlock::iterator &MI) = 0; - /// Handles atomic instruction \p MI with \p ret indicating whether \p MI - /// returns a result. 
- virtual bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) = 0; + bool IsAtomicWithRet) override; }; -class SIGfx9PreciseMemorySupport : public SIPreciseMemorySupport { -public: - SIGfx9PreciseMemorySupport(const GCNSubtarget &ST) - : SIPreciseMemorySupport(ST) {} - bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; - bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; -}; - -class SIGfx10And11PreciseMemorySupport : public SIPreciseMemorySupport { -public: - SIGfx10And11PreciseMemorySupport(const GCNSubtarget &ST) - : SIPreciseMemorySupport(ST) {} - bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; - bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; -}; - -std::unique_ptr -SIPreciseMemorySupport::create(const GCNSubtarget &ST) { - GCNSubtarget::Generation Generation = ST.getGeneration(); - if (Generation < AMDGPUSubtarget::GFX10) - return std::make_unique(ST); - return std::make_unique(ST); -} -#endif - class SIMemoryLegalizer final : public MachineFunctionPass { private: @@ -707,7 +660,7 @@ class SIMemoryLegalizer final : public MachineFunctionPass { std::unique_ptr CC = nullptr; /// Precise Memory support. - bool PM = false; + bool PrecMem = false; /// List of atomic pseudo instructions. std::list AtomicPseudoMIs; @@ -2525,23 +2478,21 @@ bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory( Wait.DsCnt = 0; // LgkmCnt } else { // vector if (Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) { // VMEM load + if (TII->isVMEM(Inst)) // VMEM load Wait.LoadCnt = 0; // VmCnt - } else if (TII->isFLAT(Inst)) { // Flat load + else if (TII->isFLAT(Inst)) { // Flat load Wait.LoadCnt = 0; // VmCnt Wait.DsCnt = 0; // LgkmCnt - } else { // LDS load + } else // LDS load Wait.DsCnt = 0; // LgkmCnt - } } else { // vector store - if (TII->isVMEM(Inst)) { // VMEM store + if (TII->isVMEM(Inst)) // VMEM store Wait.LoadCnt = 0; // VmCnt - } else if (TII->isFLAT(Inst)) { // Flat store + else if (TII->isFLAT(Inst)) { // Flat store Wait.LoadCnt = 0; // VmCnt Wait.DsCnt = 0; // LgkmCnt - } else { + } else Wait.DsCnt = 0; // LDS store; LgkmCnt - } } } @@ -2552,8 +2503,8 @@ bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory( return true; } -bool SIGfx6CacheControl ::handleAtomicForPreciseMemory( - MachineBasicBlock::iterator &MI, bool ret) { +bool SIGfx6CacheControl::handleAtomicForPreciseMemory( + MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) { assert(MI->mayLoadOrStore()); AMDGPU::Waitcnt Wait; @@ -2568,7 +2519,7 @@ bool SIGfx6CacheControl ::handleAtomicForPreciseMemory( return true; } -bool SIGfx10CacheControl ::handleNonAtomicForPreciseMemory( +bool SIGfx10CacheControl::handleNonAtomicForPreciseMemory( MachineBasicBlock::iterator &MI) { assert(MI->mayLoadOrStore()); @@ -2584,14 +2535,13 @@ bool SIGfx10CacheControl ::handleNonAtomicForPreciseMemory( Wait.DsCnt = 0; // LgkmCnt } else { // vector if (Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) { // VMEM load + if (TII->isVMEM(Inst)) // VMEM load Wait.LoadCnt = 0; // VmCnt - } else if (TII->isFLAT(Inst)) { // Flat load + else if (TII->isFLAT(Inst)) { // Flat load Wait.LoadCnt = 0; // VmCnt Wait.DsCnt = 0; // LgkmCnt - } else { // LDS load + } else // LDS load Wait.DsCnt = 0; // LgkmCnt - } } // For some vector instructions, mayLoad() and mayStore() can be both true. 
@@ -2604,9 +2554,8 @@ bool SIGfx10CacheControl ::handleNonAtomicForPreciseMemory( } else if (TII->isFLAT(Inst)) { // Flat store Wait.DsCnt = 0; // LgkmCnt BuildVsCnt = true; - } else { + } else Wait.DsCnt = 0; // LDS store; LgkmCnt - } } } @@ -2627,20 +2576,20 @@ bool SIGfx10CacheControl ::handleNonAtomicForPreciseMemory( } bool SIGfx10CacheControl ::handleAtomicForPreciseMemory( - MachineBasicBlock::iterator &MI, bool ret) { + MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) { assert(MI->mayLoadOrStore()); AMDGPU::Waitcnt Wait; Wait.DsCnt = 0; // LgkmCnt - if (ret) + if (IsAtomicWithRet) Wait.LoadCnt = 0; // VmCnt unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); MachineBasicBlock &MBB = *MI->getParent(); BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); --MI; - if (!ret) { + if (!IsAtomicWithRet) { BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) .addImm(0); @@ -2668,21 +2617,19 @@ bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory( WaitType = AMDGPU::S_WAIT_LOADCNT; LoadAndStore = true; } else if (Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) { // VMEM load + if (TII->isVMEM(Inst)) // VMEM load WaitType = AMDGPU::S_WAIT_LOADCNT; - } else if (TII->isFLAT(Inst)) { // Flat load + else if (TII->isFLAT(Inst)) // Flat load WaitType = AMDGPU::S_WAIT_LOADCNT_DSCNT; - } else { // LDS load + else // LDS load WaitType = AMDGPU::S_WAIT_DSCNT; - } } else { // vector store - if (TII->isVMEM(Inst)) { // VMEM store + if (TII->isVMEM(Inst)) // VMEM store WaitType = AMDGPU::S_WAIT_STORECNT; - } else if (TII->isFLAT(Inst)) { // Flat store + else if (TII->isFLAT(Inst)) // Flat store WaitType = AMDGPU::S_WAIT_STORECNT_DSCNT; - } else { + else WaitType = AMDGPU::S_WAIT_DSCNT; - } } } @@ -2714,15 +2661,15 @@ bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory( } bool SIGfx12CacheControl ::handleAtomicForPreciseMemory( - MachineBasicBlock::iterator &MI, bool ret) { + MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) { assert(MI->mayLoadOrStore()); MachineBasicBlock &MBB = *MI->getParent(); - if (ret) { + if (IsAtomicWithRet) BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT)).addImm(0); - } else { + else BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_STORECNT)).addImm(0); - } + --MI; return true; } @@ -2782,7 +2729,7 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(), MOI.isNonTemporal()); - if (PM) + if (PrecMem) Changed |= CC->handleNonAtomicForPreciseMemory(MI); return Changed; @@ -2823,7 +2770,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // instruction field, do not confuse it with atomic scope. 
Changed |= CC->expandSystemScopeStore(MI); - if (PM) + if (PrecMem) Changed |= CC->handleNonAtomicForPreciseMemory(MI); return Changed; @@ -2918,7 +2865,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } - if (PM) + if (PrecMem) Changed |= CC->handleNonAtomicForPreciseMemory(MI); return Changed; @@ -2931,9 +2878,9 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { CC = SICacheControl::create(MF.getSubtarget()); const GCNSubtarget &ST = MF.getSubtarget(); - PM = false; + PrecMem = false; if (ST.isPreciseMemoryEnabled()) - PM = true; + PrecMem = true; for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { @@ -2954,7 +2901,7 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { } if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) { - if (PM && MI->mayLoadOrStore()) { + if (PrecMem && MI->mayLoadOrStore()) { Changed |= CC->handleNonAtomicForPreciseMemory(MI); } continue; From 571ce5844a88063164a8aaa36dea553082c79184 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 15 Feb 2024 17:12:53 -0600 Subject: [PATCH 06/16] Code formatting. --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 56 ++++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 6cb9519d4169b4..c6f0191f4454c6 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2475,22 +2475,22 @@ bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory( if (TII->isSMRD(Inst)) { // scalar if (Inst.mayStore()) return false; - Wait.DsCnt = 0; // LgkmCnt - } else { // vector - if (Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) // VMEM load - Wait.LoadCnt = 0; // VmCnt - else if (TII->isFLAT(Inst)) { // Flat load - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - } else // LDS load - Wait.DsCnt = 0; // LgkmCnt - } else { // vector store - if (TII->isVMEM(Inst)) // VMEM store - Wait.LoadCnt = 0; // VmCnt - else if (TII->isFLAT(Inst)) { // Flat store - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt + Wait.DsCnt = 0; // LgkmCnt + } else { // vector + if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst)) // VMEM load + Wait.LoadCnt = 0; // VmCnt + else if (TII->isFLAT(Inst)) { // Flat load + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + } else // LDS load + Wait.DsCnt = 0; // LgkmCnt + } else { // vector store + if (TII->isVMEM(Inst)) // VMEM store + Wait.LoadCnt = 0; // VmCnt + else if (TII->isFLAT(Inst)) { // Flat store + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt } else Wait.DsCnt = 0; // LDS store; LgkmCnt } @@ -2532,16 +2532,16 @@ bool SIGfx10CacheControl::handleNonAtomicForPreciseMemory( if (TII->isSMRD(Inst)) { // scalar if (Inst.mayStore()) return false; - Wait.DsCnt = 0; // LgkmCnt - } else { // vector - if (Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) // VMEM load - Wait.LoadCnt = 0; // VmCnt - else if (TII->isFLAT(Inst)) { // Flat load - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - } else // LDS load - Wait.DsCnt = 0; // LgkmCnt + Wait.DsCnt = 0; // LgkmCnt + } else { // vector + if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst)) // VMEM load + Wait.LoadCnt = 0; // VmCnt + else if (TII->isFLAT(Inst)) { // Flat load + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + } else // LDS load + Wait.DsCnt = 0; // LgkmCnt } // For some vector instructions, 
mayLoad() and mayStore() can be both true. @@ -2623,8 +2623,8 @@ bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory( WaitType = AMDGPU::S_WAIT_LOADCNT_DSCNT; else // LDS load WaitType = AMDGPU::S_WAIT_DSCNT; - } else { // vector store - if (TII->isVMEM(Inst)) // VMEM store + } else { // vector store + if (TII->isVMEM(Inst)) // VMEM store WaitType = AMDGPU::S_WAIT_STORECNT; else if (TII->isFLAT(Inst)) // Flat store WaitType = AMDGPU::S_WAIT_STORECNT_DSCNT; From 93b00bf009a96020d3b1e3a01b6f2a1d324c126f Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 21 Feb 2024 12:41:59 -0600 Subject: [PATCH 07/16] Change the option from amdgpu-precise-memory-op to precise-memory for the backend. --- clang/include/clang/Driver/Options.td | 3 +-- clang/lib/Driver/ToolChains/AMDGPU.cpp | 4 ++++ llvm/lib/Target/AMDGPU/AMDGPU.td | 2 +- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 4 ++-- .../AMDGPU/insert_waitcnt_for_precise_memory.ll | 12 ++++++------ 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index bba239c86d5e1c..e2075db2885720 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4904,8 +4904,7 @@ defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64", " mode (AMDGPU only)">; defm amdgpu_precise_memory_op : SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable", - " precise memory mode (AMDGPU only)", - m_amdgpu_Features_Group>; + " precise memory mode (AMDGPU only)">; defm unsafe_fp_atomics : BoolMOption<"unsafe-fp-atomics", TargetOpts<"AllowAMDGPUUnsafeFPAtomics">, DefaultFalse, diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index e122379e860e20..4e6362a0f40632 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -670,6 +670,10 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D, options::OPT_mno_wavefrontsize64, false)) Features.push_back("+wavefrontsize64"); + if (Args.hasFlag(options::OPT_mamdgpu_precise_memory_op, + options::OPT_mno_amdgpu_precise_memory_op, false)) + Features.push_back("+precise-memory"); + handleTargetFeaturesGroup(D, Triple, Args, Features, options::OPT_m_amdgpu_Features_Group); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index c6aea7f0865fae..9b09550159993c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -169,7 +169,7 @@ def FeatureCuMode : SubtargetFeature<"cumode", >; def FeaturePreciseMemory - : SubtargetFeature<"amdgpu-precise-memory-op", "EnablePreciseMemory", + : SubtargetFeature<"precise-memory", "EnablePreciseMemory", "true", "Enable precise memory mode">; def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index c6f0191f4454c6..0d2b0a99db24da 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -367,8 +367,8 @@ class SICacheControl { } public: - // The following is for supporting precise memory mode. When the option - // amdgpu-precise-memory is enabled, an s_waitcnt instruction is inserted + // The following is for supporting precise memory mode. When the feature + // precise-memory is enabled, an s_waitcnt instruction is inserted // after each memory instruction. 
virtual bool diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 048a1f999f0195..facc63cf80189c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -1,9 +1,9 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX9 -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX90A -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX10 -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX11 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+precise-memory -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12 ; from atomicrmw-expand.ll ; covers flat_load, flat_atomic From c42d3fbb4716d68522d09752e709a76e7e3fda7c Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 29 Feb 2024 17:29:23 -0600 Subject: [PATCH 08/16] Move implementation from SIMemoryLegalizer to SIInsertWaitcnts. 
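This moves the precise-memory handling out of SIMemoryLegalizer (where each SICacheControl subclass hand-built the s_waitcnt / s_wait_* sequences per instruction kind) and into SIInsertWaitcnts, which already models the outstanding counters for every generation. After each memory instruction an all-zero wait is requested, and the existing score brackets plus generateWaitcnt() emit only the counters that are actually still outstanding. A minimal sketch of the new hook, condensed from the SIInsertWaitcnts.cpp hunk below (ST, WCG, WCGPreGFX12, ScoreBrackets, Block and Modified are the pass's existing state, not new names):

  if (ST->isPreciseMemoryEnabled()) {
    // Pre-GFX12 targets zero the legacy vm/exp/lgkm/vs counters; otherwise
    // the extended GFX12 counter set is zeroed.
    AMDGPU::Waitcnt Wait = (WCG == &WCGPreGFX12)
                               ? AMDGPU::Waitcnt(0, 0, 0, 0)
                               : AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0);
    if (!Inst.mayStore())
      Wait.StoreCnt = ~0u; // Pure loads need not drain the store counter.
    ScoreBrackets.simplifyWaitcnt(Wait); // Drop counters already satisfied.
    Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
                                ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
  }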
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 14 + llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 263 +----------------- .../insert_waitcnt_for_precise_memory.ll | 58 ++-- 3 files changed, 44 insertions(+), 291 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index bb499c5c8c578e..c22adbf8f46c9d 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2305,6 +2305,20 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } #endif + if (ST->isPreciseMemoryEnabled()) { + AMDGPU::Waitcnt Wait; + if (WCG == &WCGPreGFX12) + Wait = AMDGPU::Waitcnt(0, 0, 0, 0); + else + Wait = AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0); + + if (!Inst.mayStore()) + Wait.StoreCnt = ~0u; + ScoreBrackets.simplifyWaitcnt(Wait); + Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block, + ScoreBrackets, /*OldWaitcntInstr=*/nullptr); + } + LLVM_DEBUG({ Inst.print(dbgs()); ScoreBrackets.dump(); diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 0d2b0a99db24da..91a4f8973f2990 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -17,7 +17,6 @@ #include "AMDGPUMachineModuleInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -365,18 +364,6 @@ class SICacheControl { MachineBasicBlock::iterator &MI) const { return false; } - -public: - // The following is for supporting precise memory mode. When the feature - // precise-memory is enabled, an s_waitcnt instruction is inserted - // after each memory instruction. - - virtual bool - handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) = 0; - /// Handles atomic instruction \p MI with \p IsAtomicWithRet indicating - /// whether \p MI returns a result. 
- virtual bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, - bool IsAtomicWithRet) = 0; }; class SIGfx6CacheControl : public SICacheControl { @@ -432,11 +419,6 @@ class SIGfx6CacheControl : public SICacheControl { SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; - - bool - handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) override; - bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, - bool IsAtomicWithRet) override; }; class SIGfx7CacheControl : public SIGfx6CacheControl { @@ -589,11 +571,6 @@ class SIGfx10CacheControl : public SIGfx7CacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; - - bool - handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) override; - bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, - bool IsAtomicWithRet) override; }; class SIGfx11CacheControl : public SIGfx10CacheControl { @@ -646,11 +623,6 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsLastUse) const override; bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; - - bool - handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) override; - bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, - bool IsAtomicWithRet) override; }; class SIMemoryLegalizer final : public MachineFunctionPass { @@ -659,9 +631,6 @@ class SIMemoryLegalizer final : public MachineFunctionPass { /// Cache Control. std::unique_ptr CC = nullptr; - /// Precise Memory support. - bool PrecMem = false; - /// List of atomic pseudo instructions. std::list AtomicPseudoMIs; @@ -2461,217 +2430,8 @@ bool SIGfx12CacheControl::expandSystemScopeStore( MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) return insertWaitsBeforeSystemScopeStore(MI); - return false; -} - - -bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory( - MachineBasicBlock::iterator &MI) { - assert(MI->mayLoadOrStore()); - - MachineInstr &Inst = *MI; - AMDGPU::Waitcnt Wait; - - if (TII->isSMRD(Inst)) { // scalar - if (Inst.mayStore()) - return false; - Wait.DsCnt = 0; // LgkmCnt - } else { // vector - if (Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) // VMEM load - Wait.LoadCnt = 0; // VmCnt - else if (TII->isFLAT(Inst)) { // Flat load - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - } else // LDS load - Wait.DsCnt = 0; // LgkmCnt - } else { // vector store - if (TII->isVMEM(Inst)) // VMEM store - Wait.LoadCnt = 0; // VmCnt - else if (TII->isFLAT(Inst)) { // Flat store - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - } else - Wait.DsCnt = 0; // LDS store; LgkmCnt - } - } - - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - MachineBasicBlock &MBB = *MI->getParent(); - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - --MI; - return true; -} -bool SIGfx6CacheControl::handleAtomicForPreciseMemory( - MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) { - assert(MI->mayLoadOrStore()); - - AMDGPU::Waitcnt Wait; - - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - MachineBasicBlock &MBB = *MI->getParent(); - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - --MI; - return true; -} - -bool SIGfx10CacheControl::handleNonAtomicForPreciseMemory( - MachineBasicBlock::iterator &MI) { - assert(MI->mayLoadOrStore()); - - MachineInstr &Inst = 
*MI; - AMDGPU::Waitcnt Wait; - - bool BuildWaitCnt = true; - bool BuildVsCnt = false; - - if (TII->isSMRD(Inst)) { // scalar - if (Inst.mayStore()) - return false; - Wait.DsCnt = 0; // LgkmCnt - } else { // vector - if (Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) // VMEM load - Wait.LoadCnt = 0; // VmCnt - else if (TII->isFLAT(Inst)) { // Flat load - Wait.LoadCnt = 0; // VmCnt - Wait.DsCnt = 0; // LgkmCnt - } else // LDS load - Wait.DsCnt = 0; // LgkmCnt - } - - // For some vector instructions, mayLoad() and mayStore() can be both true. - if (Inst.mayStore()) { // vector store; an instruction can be both - // load/store - if (TII->isVMEM(Inst)) { // VMEM store - if (!Inst.mayLoad()) - BuildWaitCnt = false; - BuildVsCnt = true; - } else if (TII->isFLAT(Inst)) { // Flat store - Wait.DsCnt = 0; // LgkmCnt - BuildVsCnt = true; - } else - Wait.DsCnt = 0; // LDS store; LgkmCnt - } - } - - MachineBasicBlock &MBB = *MI->getParent(); - if (BuildWaitCnt) { - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - --MI; - } - - if (BuildVsCnt) { - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); - --MI; - } - return true; -} - -bool SIGfx10CacheControl ::handleAtomicForPreciseMemory( - MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) { - assert(MI->mayLoadOrStore()); - - AMDGPU::Waitcnt Wait; - - Wait.DsCnt = 0; // LgkmCnt - if (IsAtomicWithRet) - Wait.LoadCnt = 0; // VmCnt - - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - MachineBasicBlock &MBB = *MI->getParent(); - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - --MI; - if (!IsAtomicWithRet) { - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); - --MI; - } - return true; -} - -bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory( - MachineBasicBlock::iterator &MI) { - assert(MI->mayLoadOrStore()); - - MachineInstr &Inst = *MI; - unsigned WaitType = 0; - // For some vector instructions, mayLoad() and mayStore() can be both true. 
- bool LoadAndStore = false; - - if (TII->isSMRD(Inst)) { // scalar - if (Inst.mayStore()) - return false; - - WaitType = AMDGPU::S_WAIT_KMCNT; - } else { // vector - if (Inst.mayLoad() && Inst.mayStore()) { - WaitType = AMDGPU::S_WAIT_LOADCNT; - LoadAndStore = true; - } else if (Inst.mayLoad()) { // vector load - if (TII->isVMEM(Inst)) // VMEM load - WaitType = AMDGPU::S_WAIT_LOADCNT; - else if (TII->isFLAT(Inst)) // Flat load - WaitType = AMDGPU::S_WAIT_LOADCNT_DSCNT; - else // LDS load - WaitType = AMDGPU::S_WAIT_DSCNT; - } else { // vector store - if (TII->isVMEM(Inst)) // VMEM store - WaitType = AMDGPU::S_WAIT_STORECNT; - else if (TII->isFLAT(Inst)) // Flat store - WaitType = AMDGPU::S_WAIT_STORECNT_DSCNT; - else - WaitType = AMDGPU::S_WAIT_DSCNT; - } - } - - assert(WaitType != 0); - - MachineBasicBlock &MBB = *MI->getParent(); - - unsigned Enc = 0; - if (WaitType == AMDGPU::S_WAIT_LOADCNT_DSCNT) { - AMDGPU::Waitcnt Wait; - Wait.DsCnt = 0; - Wait.LoadCnt = 0; - Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait); - } else if (WaitType == AMDGPU::S_WAIT_STORECNT_DSCNT) { - AMDGPU::Waitcnt Wait; - Wait.DsCnt = 0; - Wait.StoreCnt = 0; - Enc = AMDGPU::encodeStorecntDscnt(IV, Wait); - } - - BuildMI(MBB, ++MI, DebugLoc(), TII->get(WaitType)).addImm(Enc); - --MI; - if (LoadAndStore) { - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_STORECNT)) - .addImm(Enc); - --MI; - } - return true; -} - -bool SIGfx12CacheControl ::handleAtomicForPreciseMemory( - MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) { - assert(MI->mayLoadOrStore()); - - MachineBasicBlock &MBB = *MI->getParent(); - if (IsAtomicWithRet) - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT)).addImm(0); - else - BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_STORECNT)).addImm(0); - - --MI; - return true; + return false; } bool SIMemoryLegalizer::removeAtomicPseudoMIs() { @@ -2729,9 +2489,6 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(), MOI.isNonTemporal()); - if (PrecMem) - Changed |= CC->handleNonAtomicForPreciseMemory(MI); - return Changed; } @@ -2769,10 +2526,6 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. 
Changed |= CC->expandSystemScopeStore(MI); - - if (PrecMem) - Changed |= CC->handleNonAtomicForPreciseMemory(MI); - return Changed; } @@ -2865,9 +2618,6 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } - if (PrecMem) - Changed |= CC->handleNonAtomicForPreciseMemory(MI); - return Changed; } @@ -2877,11 +2627,6 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { SIMemOpAccess MOA(MF); CC = SICacheControl::create(MF.getSubtarget()); - const GCNSubtarget &ST = MF.getSubtarget(); - PrecMem = false; - if (ST.isPreciseMemoryEnabled()) - PrecMem = true; - for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { @@ -2900,12 +2645,8 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { MI = II->getIterator(); } - if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) { - if (PrecMem && MI->mayLoadOrStore()) { - Changed |= CC->handleNonAtomicForPreciseMemory(MI); - } + if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; - } if (const auto &MOI = MOA.getLoadInfo(MI)) Changed |= expandLoad(*MOI, MI); diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index facc63cf80189c..0012372e454539 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -7,7 +7,7 @@ ; from atomicrmw-expand.ll ; covers flat_load, flat_atomic -define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; ; GFX90A-LABEL: syncscope_workgroup_nortn: ; GFX90A: ; %bb.0: ; GFX90A: flat_load_dword v5, v[0:1] @@ -42,6 +42,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret void } @@ -53,7 +54,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -77,7 +78,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v2, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -88,8 +89,8 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 @@ -99,7 +100,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v2, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11: .LBB1_1: ; %atomicrmw.start ; 
GFX11: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -108,8 +109,8 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv ; ; GFX12-LABEL: atomic_nand_i32_global: ; GFX12: ; %bb.0: @@ -119,7 +120,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v2, v[0:1], off -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12: .LBB1_1: ; %atomicrmw.start ; GFX12: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -143,18 +144,17 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_short v[2:3], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_load_store: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_short v[2:3], v0, off -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -162,9 +162,8 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v[2:3], v0, off -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -176,9 +175,9 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v[0:1], off -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v[2:3], v0, off -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -192,9 +191,9 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; ; GFX9-FLATSCR-LABEL: {{^}}vs_main: ; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) define amdgpu_vs float @vs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -219,7 +218,6 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX10: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: 
s_endpgm ; @@ -229,9 +227,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX11: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: udiv_i32: ; GFX12: ; %bb.0: @@ -239,8 +236,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cvt_f32_u32 s4, s3 ; GFX12: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { %r = udiv i32 %x, %y @@ -331,7 +328,8 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NOT: s_waitcnt_vscnt null, 0x0 +; GFX10: .LBB{{[0-9]+}}_2 ; ; GFX11-LABEL: add_i32_constant: ; GFX11: ; %bb.1: @@ -339,7 +337,6 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; ; GFX12-LABEL: add_i32_constant: ; GFX12: ; %bb.1: @@ -347,12 +344,12 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: .LBB{{[0-9]+}}_2 ; GFX12: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { entry: @@ -436,19 +433,20 @@ declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NOT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: {{^}}atomic_swap_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: {{^}}atomic_swap_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ; return to shader part epilog define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { main_body: From 44bada0c2eff38c0326c26289cc1bd068871320c Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 25 Mar 2024 19:12:43 -0500 Subject: [PATCH 09/16] Minor code change. 
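Two small follow-ups to the SIInsertWaitcnts hook: the extra wait is now emitted only after instructions that actually access memory (Inst.mayLoadOrStore()), and the counter layout is keyed off ST->hasExtendedWaitCounts() instead of comparing against the pre-GFX12 waitcnt generator. The clang driver test is also updated to match the renamed "+precise-memory" target feature. A minimal sketch of the revised guard, condensed from the hunk below (the rest of the block is unchanged):

  if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
    AMDGPU::Waitcnt Wait = ST->hasExtendedWaitCounts()
                               ? AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0) // GFX12+
                               : AMDGPU::Waitcnt(0, 0, 0, 0);
    // StoreCnt masking, simplifyWaitcnt() and generateWaitcnt() as before.
  }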
--- clang/test/Driver/amdgpu-features.c | 4 ++-- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c index 57d31ccedd8783..864744db203e91 100644 --- a/clang/test/Driver/amdgpu-features.c +++ b/clang/test/Driver/amdgpu-features.c @@ -34,7 +34,7 @@ // NO-CUMODE: "-target-feature" "-cumode" // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mamdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=PREC-MEM %s -// PREC-MEM: "-target-feature" "+amdgpu-precise-memory-op" +// PREC-MEM: "-target-feature" "+precise-memory" // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s -// NO-PREC-MEM: "-target-feature" "-amdgpu-precise-memory-op" +// NO-PREC-MEM-NOT: {{".*precise-memory"}} diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c22adbf8f46c9d..7207c0f4894b45 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2305,12 +2305,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } #endif - if (ST->isPreciseMemoryEnabled()) { + if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { AMDGPU::Waitcnt Wait; - if (WCG == &WCGPreGFX12) - Wait = AMDGPU::Waitcnt(0, 0, 0, 0); - else + if (ST->hasExtendedWaitCounts()) Wait = AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0); + else + Wait = AMDGPU::Waitcnt(0, 0, 0, 0); if (!Inst.mayStore()) Wait.StoreCnt = ~0u; From 20312a19f8e5ad10814a3ef6eeef746086a59d5d Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 26 Mar 2024 19:00:18 -0500 Subject: [PATCH 10/16] Use getAllZeroWaitcnt() when creating the Wait obj. Some changes to the test file. 
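Instead of building the Waitcnt by hand for each generation, ask the active waitcnt generator for an all-zero wait via getAllZeroWaitcnt(); judging by its use here, the boolean argument says whether the store counter must also reach zero, so it is requested only for stores that are not returning atomics (returning atomics are already covered by the load counter). The flat-scratch RUN line is simplified, and a scratch/buffer load-store test (test_load_store_as5) replaces the vs_main test. A minimal sketch of the resulting hook, condensed from the hunk below (WCG, ScoreBrackets, Block and Modified are the pass's existing state):

  if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
    AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
        Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
    ScoreBrackets.simplifyWaitcnt(Wait); // Drop counters already satisfied.
    Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
                                ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
  }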
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 10 +-- .../insert_waitcnt_for_precise_memory.ll | 87 ++++++++++++++----- 2 files changed, 66 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7207c0f4894b45..556ec3e231ff19 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2306,14 +2306,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, #endif if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { - AMDGPU::Waitcnt Wait; - if (ST->hasExtendedWaitCounts()) - Wait = AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0); - else - Wait = AMDGPU::Waitcnt(0, 0, 0, 0); - - if (!Inst.mayStore()) - Wait.StoreCnt = ~0u; + AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt( + Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)); ScoreBrackets.simplifyWaitcnt(Wait); Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block, ScoreBrackets, /*OldWaitcntInstr=*/nullptr); diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 0012372e454539..6b3fc191937def 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -1,12 +1,12 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+precise-memory -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12 ; from atomicrmw-expand.ll -; covers flat_load, flat_atomic +; covers flat_load, flat_atomic (atomic with return) ; ; GFX90A-LABEL: syncscope_workgroup_nortn: ; GFX90A: ; %bb.0: @@ -48,7 +48,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { } ; from atomicrmw-nand.ll -; covers global_atomic, global_load +; covers global_atomic (atomic with return), global_load ; ; GFX9-LABEL: atomic_nand_i32_global: ; GFX9: ; %bb.0: @@ -138,7 +138,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { } ; from bf16.ll -; covers buffer_load, buffer_store, flat_load, flat_store, global_load, global_store +; covers flat_load, flat_store, global_load, global_store ; ; GFX9-LABEL: test_load_store: ; GFX9: ; %bb.0: @@ -186,19 +186,60 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ret void } -; from scratch-simple.ll -; covers scratch_load, scratch_store -; -; GFX9-FLATSCR-LABEL: {{^}}vs_main: -; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -define amdgpu_vs float @vs_main(i32 %idx) { - %v1 = extractelement <81 x float> , i32 %idx - %v2 = extractelement <81 x float> , i32 %idx - %r = fadd float %v1, 
%v2 - ret float %r +; covers scratch_load, scratch_store, buffer_load, buffer_store +; GFX9-LABEL: test_load_store_as5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: test_load_store_as5: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_load_ushort v0, v0, off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_short v1, v0, off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store_as5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_load_store_as5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_u16 v0, v0, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b16 v1, v0, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_load_store_as5: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_load_u16 v0, v0, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b16 v1, v0, off +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + +define void @test_load_store_as5(ptr addrspace(5) %in, ptr addrspace(5) %out) { + %val = load bfloat, ptr addrspace(5) %in + store bfloat %val, ptr addrspace(5) %out + ret void } ; from udiv.ll @@ -274,7 +315,7 @@ main_body: } ; from atomic_load_add.ll -; covers s_load, ds_add +; covers s_load, ds_add (atomic without return) ; GFX9-LABEL: atomic_add_local: ; GFX9: ; %bb.1: ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 @@ -314,7 +355,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) ; from atomic_optimizations_buffer.ll -; covers buffer_atomic +; covers buffer_atomic (atomic with return) ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 @@ -424,7 +465,7 @@ main_body: declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) ; from llvm.amdgcn.image.atomic.dim.ll -; covers image_atomic +; covers image_atomic (atomic with return) ; GFX90A-LABEL: {{^}}atomic_swap_1d: ; GFX90A: image_atomic_swap v0, v{{[02468]}}, s[0:7] dmask:0x1 unorm glc{{$}} ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -456,7 +497,7 @@ main_body: } ; from lds-bounds.ll -; covers ds_write_b64 +; covers ds_write_b64 (atomic without return) @compute_lds = external addrspace(3) global [512 x i32], align 16 ; GFX9-LABEL: {{^}}store_aligned: ; GFX9: ds_write_b64 @@ -509,7 +550,7 @@ entry: %v.0 = load i32, ptr addrspace(3) %ptr, align 8 %v.1 = load i32, ptr addrspace(3) %ptr.gep.1 - %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0 + %r.0 = insertelement <2 x i32> poison, i32 %v.0, i32 0 %r.1 = insertelement <2 x i32> 
%r.0, i32 %v.1, i32 1 %bc = bitcast <2 x i32> %r.1 to <2 x float> ret <2 x float> %bc @@ -569,7 +610,7 @@ entry: %v.0 = load i32, ptr addrspace(3) %ptr.a %v.1 = load i32, ptr addrspace(3) %ptr.b - %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0 + %r.0 = insertelement <2 x i32> poison, i32 %v.0, i32 0 %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1 %bc = bitcast <2 x i32> %r.1 to <2 x float> ret <2 x float> %bc From 12dde5f3c4a66f7474922b1317d9411e2c156368 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Fri, 29 Mar 2024 13:55:09 -0500 Subject: [PATCH 11/16] Use update_llc_test_checks.py on insert_waitcnt_for_precise_memory.ll. --- .../insert_waitcnt_for_precise_memory.ll | 1139 ++++++++++++++--- 1 file changed, 967 insertions(+), 172 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 6b3fc191937def..ef9e4cc7528ce0 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 @@ -8,32 +9,115 @@ ; from atomicrmw-expand.ll ; covers flat_load, flat_atomic (atomic with return) ; +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX9-LABEL: syncscope_workgroup_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dword v4, v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX9-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX90A-LABEL: syncscope_workgroup_nortn: -; GFX90A: ; %bb.0: -; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A: .LBB0_1: ; %atomicrmw.start -; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: syncscope_workgroup_nortn: -; GFX10: ; %bb.0: -; GFX10: flat_load_dword v4, v[0:1] +; 
GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10: .LBB0_1: ; %atomicrmw.start -; GFX10: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: flat_load_dword v4, v[0:1] +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-FLATSCR-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX9-FLATSCR-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-FLATSCR-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-FLATSCR-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: syncscope_workgroup_nortn: -; GFX11: ; %bb.0: -; GFX11: flat_load_b32 v4, v[0:1] +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11: .LBB0_1: ; %atomicrmw.start -; GFX11: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: syncscope_workgroup_nortn: -; GFX12: ; %bb.0: +; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 @@ -41,8 +125,24 @@ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 - -define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: 
v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret void } @@ -50,6 +150,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; from atomicrmw-nand.ll ; covers global_atomic (atomic with return), global_load ; +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9-LABEL: atomic_nand_i32_global: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -58,7 +159,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NOT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: v_not_b32_e32 v2, v3 ; GFX9-NEXT: v_or_b32_e32 v2, -5, v2 @@ -74,6 +174,31 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GFX90A-LABEL: atomic_nand_i32_global: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_not_b32_e32 v2, v3 +; GFX90A-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-LABEL: atomic_nand_i32_global: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -82,7 +207,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NOT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: v_not_b32_e32 v2, v3 ; GFX10-NEXT: v_or_b32_e32 v2, -5, v2 @@ -95,44 +219,90 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: atomic_nand_i32_global: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: global_load_dword v2, v[0:1], off +; GFX9-FLATSCR-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-FLATSCR-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-FLATSCR-NEXT: v_not_b32_e32 v2, v3 +; GFX9-FLATSCR-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX9-FLATSCR-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: buffer_wbinvl1_vol +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-FLATSCR-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-FLATSCR-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: atomic_nand_i32_global: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11: .LBB1_1: ; %atomicrmw.start -; GFX11: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v3 -; GFX11-NEXT: v_or_b32_e32 v2, -5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: atomic_nand_i32_global: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v2, v[0:1], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12: .LBB1_1: ; %atomicrmw.start -; GFX12: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v3 -; GFX12-NEXT: v_or_b32_e32 v2, -5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 - -define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: global_load_b32 v2, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v2, v3 +; GFX12-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst ret i32 %result } @@ -140,6 +310,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; from bf16.ll ; covers flat_load, flat_store, global_load, global_store ; +define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX9-LABEL: test_load_store: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -149,6 +320,15 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GFX90A-LABEL: test_load_store: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_ushort v0, v[0:1], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_short v[2:3], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-LABEL: test_load_store: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -158,6 +338,15 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX9-FLATSCR-LABEL: test_load_store: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_short v[2:3], v0, off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: test_load_store: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -179,14 +368,13 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: global_store_b16 v[2:3], v0, off ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - -define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { %val = load bfloat, ptr addrspace(1) %in store bfloat %val, ptr addrspace(1) %out ret void } ; covers scratch_load, scratch_store, buffer_load, buffer_store +define void @test_load_store_as5(ptr addrspace(5) %in, ptr addrspace(5) %out) { ; GFX9-LABEL: test_load_store_as5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -196,14 +384,14 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-FLATSCR-LABEL: test_load_store_as5: -; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX9-FLATSCR-NEXT: scratch_load_ushort v0, v0, off -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_short v1, v0, off -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: test_load_store_as5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_load_store_as5: ; GFX10: ; %bb.0: @@ -214,6 +402,15 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX9-FLATSCR-LABEL: test_load_store_as5: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_load_ushort v0, v0, off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_short v1, v0, off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: test_load_store_as5: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -235,8 +432,6 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-NEXT: scratch_store_b16 v1, v0, off ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - -define void @test_load_store_as5(ptr addrspace(5) %in, ptr addrspace(5) %out) { %val = load bfloat, ptr addrspace(5) %in store bfloat %val, ptr addrspace(5) %out ret void @@ -245,29 +440,161 @@ define void @test_load_store_as5(ptr addrspace(5) %in, ptr addrspace(5) %out) { ; from udiv.ll ; covers s_load ; +define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NOT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-NEXT: s_mul_i32 s5, s4, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_add_i32 s5, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_sub_i32 s4, 0, s3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v0 +; GFX90A-NEXT: s_mul_i32 s4, s4, s5 +; GFX90A-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX90A-NEXT: s_add_i32 s5, s5, s4 +; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX90A-NEXT: 
s_mul_i32 s5, s4, s3 +; GFX90A-NEXT: s_sub_i32 s2, s2, s5 +; GFX90A-NEXT: s_add_i32 s6, s4, 1 +; GFX90A-NEXT: s_sub_i32 s5, s2, s3 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 +; GFX90A-NEXT: s_cselect_b32 s4, s6, s4 +; GFX90A-NEXT: s_cselect_b32 s2, s5, s2 +; GFX90A-NEXT: s_add_i32 s5, s4, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 +; GFX90A-NEXT: s_cselect_b32 s2, s5, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: udiv_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX10: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_sub_i32 s5, 0, s3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mul_i32 s5, s5, s4 +; GFX10-NEXT: s_mul_hi_u32 s5, s4, s5 +; GFX10-NEXT: s_add_i32 s4, s4, s5 +; GFX10-NEXT: s_mul_hi_u32 s4, s2, s4 +; GFX10-NEXT: s_mul_i32 s5, s4, s3 +; GFX10-NEXT: s_sub_i32 s2, s2, s5 +; GFX10-NEXT: s_add_i32 s5, s4, 1 +; GFX10-NEXT: s_sub_i32 s6, s2, s3 +; GFX10-NEXT: s_cmp_ge_u32 s2, s3 +; GFX10-NEXT: s_cselect_b32 s4, s5, s4 +; GFX10-NEXT: s_cselect_b32 s2, s6, s2 +; GFX10-NEXT: s_add_i32 s5, s4, 1 +; GFX10-NEXT: s_cmp_ge_u32 s2, s3 +; GFX10-NEXT: s_cselect_b32 s2, s5, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; +; GFX9-FLATSCR-LABEL: udiv_i32: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-FLATSCR-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-FLATSCR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-FLATSCR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-FLATSCR-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-FLATSCR-NEXT: s_add_i32 s5, s5, s4 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-FLATSCR-NEXT: s_mul_i32 s5, s4, s3 +; GFX9-FLATSCR-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-FLATSCR-NEXT: s_add_i32 s6, s4, 1 +; GFX9-FLATSCR-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-FLATSCR-NEXT: s_add_i32 s5, s4, 1 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_endpgm +; ; GFX11-LABEL: udiv_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX11: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sub_i32 s5, 0, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mul_i32 s5, s5, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_hi_u32 s5, s4, s5 +; GFX11-NEXT: s_add_i32 s4, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_hi_u32 s4, s2, s4 +; GFX11-NEXT: s_mul_i32 s5, s4, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s2, s2, s5 +; GFX11-NEXT: s_add_i32 s5, s4, 1 +; GFX11-NEXT: s_sub_i32 s6, s2, s3 +; GFX11-NEXT: s_cmp_ge_u32 s2, s3 +; GFX11-NEXT: s_cselect_b32 s4, s5, s4 +; GFX11-NEXT: s_cselect_b32 s2, s6, s2 +; GFX11-NEXT: s_add_i32 s5, s4, 1 +; GFX11-NEXT: s_cmp_ge_u32 s2, s3 +; GFX11-NEXT: s_cselect_b32 s2, s5, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm ; @@ -276,11 +603,35 @@ define void @test_load_store_as5(ptr addrspace(5) %in, ptr addrspace(5) %out) { ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cvt_f32_u32 s4, s3 -; GFX12: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_sub_co_i32 s5, 0, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe +; GFX12-NEXT: s_cvt_u32_f32 s4, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s5, s5, s4 +; GFX12-NEXT: s_mul_hi_u32 s5, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_co_i32 s4, s4, s5 +; GFX12-NEXT: s_mul_hi_u32 s4, s2, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s5, s4, s3 +; GFX12-NEXT: s_sub_co_i32 s2, s2, s5 +; GFX12-NEXT: s_add_co_i32 s5, s4, 1 +; GFX12-NEXT: s_sub_co_i32 s6, s2, s3 +; GFX12-NEXT: s_cmp_ge_u32 s2, s3 +; GFX12-NEXT: s_cselect_b32 s4, s5, s4 +; GFX12-NEXT: s_cselect_b32 s2, s6, s2 +; GFX12-NEXT: s_add_co_i32 s5, s4, 1 +; GFX12-NEXT: s_cmp_ge_u32 s2, s3 +; GFX12-NEXT: s_cselect_b32 s2, s5, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_endpgm - -define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { %r = udiv i32 %x, %y store i32 %r, ptr addrspace(1) %out ret void @@ -290,25 +641,53 @@ declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) ; from smrd.ll ; covers s_buffer_load -; GFX9-LABEL: {{^}}smrd_sgpr_offset: -; GFX9: s_buffer_load_dword s{{[0-9]}}, s[0:3], s4 +; +define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 { +; GFX9-LABEL: smrd_sgpr_offset: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_buffer_load_dword s0, s[0:3], s4 offset:0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: {{^}}smrd_sgpr_offset: -; GFX10: s_buffer_load_dword s0, s[0:3], s4 offset:0x0 +; GFX90A-LABEL: smrd_sgpr_offset: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_buffer_load_dword s0, s[0:3], s4 offset:0x0 +; GFX90A-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: smrd_sgpr_offset: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_buffer_load_dword s0, s[0:3], s4 offset:0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: {{^}}smrd_sgpr_offset: -; GFX11: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX9-FLATSCR-LABEL: smrd_sgpr_offset: +; GFX9-FLATSCR: ; %bb.0: ; %main_body +; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s5 +; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s4 +; GFX9-FLATSCR-NEXT: s_mov_b32 s9, s3 +; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s2 +; GFX9-FLATSCR-NEXT: s_buffer_load_dword s0, s[8:11], s6 offset:0x0 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: smrd_sgpr_offset: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: {{^}}smrd_sgpr_offset: -; GFX12: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 -; GFX12: s_wait_kmcnt 0x0 -; GFX12: v_mov_b32_e32 v0, s0 - -define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 { +; GFX12-LABEL: smrd_sgpr_offset: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog main_body: %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) ret float %r @@ -316,38 +695,132 @@ main_body: ; from atomic_load_add.ll ; covers s_load, ds_add (atomic without return) +; +define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-LABEL: atomic_add_local: -; GFX9: ; %bb.1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9: ds_add_u32 v0, v1 +; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_mul_i32 s1, s1, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: atomic_add_local: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB6_2 +; GFX90A-NEXT: ; %bb.1: +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: s_mul_i32 s1, s1, 5 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: ds_add_u32 v0, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB6_2: +; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: atomic_add_local: -; GFX10: ; %bb.1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB6_2 +; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10: ds_add_u32 v0, v1 +; GFX10-NEXT: s_bcnt1_i32_b32 s1, s2 +; GFX10-NEXT: s_mul_i32 s1, s1, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ds_add_u32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: .LBB6_2: +; GFX10-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: atomic_add_local: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec +; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-FLATSCR-NEXT: ; %bb.1: +; GFX9-FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s1, 5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: .LBB6_2: +; GFX9-FLATSCR-NEXT: s_endpgm ; ; GFX11-LABEL: atomic_add_local: -; GFX11: ; %bb.1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11: ds_add_u32 v0, v1 +; GFX11-NEXT: s_bcnt1_i32_b32 s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s1, s1, 5 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: ds_add_u32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: .LBB6_2: +; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_local: -; GFX12: ; %bb.1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB6_2 +; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12: ds_add_u32 v0, v1 +; GFX12-NEXT: s_bcnt1_i32_b32 s1, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: ds_add_u32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE - -define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { +; GFX12-NEXT: .LBB6_2: +; GFX12-NEXT: s_endpgm %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst ret void } @@ -356,43 +829,179 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i ; from atomic_optimizations_buffer.ll ; covers buffer_atomic (atomic with return) +; +define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX9-LABEL: add_i32_constant: -; GFX9: ; %bb.1: +; GFX9: ; %bb.0: ; %entry +; 
GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: add_i32_constant: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB7_2 +; GFX90A-NEXT: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX90A-NEXT: s_mul_i32 s4, s4, 5 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: .LBB7_2: +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX90A-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: add_i32_constant: -; GFX10: ; %bb.1: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB7_2 +; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10: buffer_atomic_add v1, off, s[4:7], 0 glc +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: s_mul_i32 s3, s3, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NOT: s_waitcnt_vscnt null, 0x0 -; GFX10: .LBB{{[0-9]+}}_2 +; GFX10-NEXT: .LBB7_2: +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: add_i32_constant: +; GFX9-FLATSCR: ; %bb.0: ; %entry +; GFX9-FLATSCR-NEXT: s_mov_b64 s[4:5], exec +; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 
+; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-FLATSCR-NEXT: ; %bb.1: +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: .LBB7_2: +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_endpgm ; ; GFX11-LABEL: add_i32_constant: -; GFX11: ; %bb.1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s3, s3, 5 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: .LBB7_2: +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: add_i32_constant: -; GFX12: ; %bb.1: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB7_2 +; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s3, s3, 5 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: .LBB{{[0-9]+}}_2 -; GFX12: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: .LBB7_2: +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: 
s_wait_storecnt 0x0 ; GFX12-NEXT: s_endpgm - -define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { entry: %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out @@ -403,26 +1012,51 @@ declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i3 ; from llvm.amdgcn.image.load.a16.ll ; covers image_load -; GFX9-LABEL: {{^}}load.f32.1d: -; GFX9: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 +; +define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { +; GFX9-LABEL: load.f32.1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: {{^}}load.f32.1d: -; GFX10: %bb.0: ; %main_body +; GFX90A-LABEL: load.f32.1d: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load.f32.1d: +; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: {{^}}load.f32.1d: -; GFX11: %bb.0: ; %main_body +; GFX9-FLATSCR-LABEL: load.f32.1d: +; GFX9-FLATSCR: ; %bb.0: ; %main_body +; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s9 +; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s8 +; GFX9-FLATSCR-NEXT: s_mov_b32 s9, s7 +; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s6 +; GFX9-FLATSCR-NEXT: s_mov_b32 s7, s5 +; GFX9-FLATSCR-NEXT: s_mov_b32 s6, s4 +; GFX9-FLATSCR-NEXT: s_mov_b32 s5, s3 +; GFX9-FLATSCR-NEXT: s_mov_b32 s4, s2 +; GFX9-FLATSCR-NEXT: image_load v0, v0, s[4:11] dmask:0x1 unorm a16 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load.f32.1d: +; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: {{^}}load.f32.1d: -; GFX12: %bb.0: ; %main_body +; GFX12-LABEL: load.f32.1d: +; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16 ; GFX12-NEXT: s_wait_loadcnt 0x0 - -define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { +; GFX12-NEXT: ; return to shader part epilog main_body: %x = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0) @@ -433,6 +1067,7 @@ declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i ; from llvm.amdgcn.image.store.a16.ll ; covers image_store +; define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) { ; GFX9-LABEL: store_f32_1d: ; GFX9: ; %bb.0: ; %main_body @@ -440,21 +1075,47 @@ define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; +; GFX90A-LABEL: store_f32_1d: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: image_store v[2:5], v0, s[0:7] dmask:0x1 unorm a16 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_endpgm +; ; GFX10-LABEL: store_f32_1d: ; GFX10: ; 
%bb.0: ; %main_body ; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; +; GFX9-FLATSCR-LABEL: store_f32_1d: +; GFX9-FLATSCR: ; %bb.0: ; %main_body +; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s9 +; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s8 +; GFX9-FLATSCR-NEXT: s_mov_b32 s9, s7 +; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s6 +; GFX9-FLATSCR-NEXT: s_mov_b32 s7, s5 +; GFX9-FLATSCR-NEXT: s_mov_b32 s6, s4 +; GFX9-FLATSCR-NEXT: s_mov_b32 s5, s3 +; GFX9-FLATSCR-NEXT: s_mov_b32 s4, s2 +; GFX9-FLATSCR-NEXT: image_store v[1:4], v0, s[4:11] dmask:0x1 unorm a16 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_endpgm +; ; GFX11-LABEL: store_f32_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_f32_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -466,30 +1127,52 @@ declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, ; from llvm.amdgcn.image.atomic.dim.ll ; covers image_atomic (atomic with return) -; GFX90A-LABEL: {{^}}atomic_swap_1d: -; GFX90A: image_atomic_swap v0, v{{[02468]}}, s[0:7] dmask:0x1 unorm glc{{$}} +; +define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX9-LABEL: atomic_swap_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX90A-LABEL: atomic_swap_1d: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: image_atomic_swap v0, v2, s[0:7] dmask:0x1 unorm glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: {{^}}atomic_swap_1d: -; GFX10: ; %bb.0: ; %main_body +; GFX10-LABEL: atomic_swap_1d: +; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NOT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: ; return to shader part epilog +; +; GFX9-FLATSCR-LABEL: atomic_swap_1d: +; GFX9-FLATSCR: ; %bb.0: ; %main_body +; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s9 +; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s8 +; GFX9-FLATSCR-NEXT: s_mov_b32 s9, s7 +; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s6 +; GFX9-FLATSCR-NEXT: s_mov_b32 s7, s5 +; GFX9-FLATSCR-NEXT: s_mov_b32 s6, s4 +; GFX9-FLATSCR-NEXT: s_mov_b32 s5, s3 +; GFX9-FLATSCR-NEXT: s_mov_b32 s4, s2 +; GFX9-FLATSCR-NEXT: image_atomic_swap v0, v1, s[4:11] dmask:0x1 unorm glc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: {{^}}atomic_swap_1d: -; GFX11: ; %bb.0: ; %main_body +; GFX11-LABEL: atomic_swap_1d: +; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog +; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: {{^}}atomic_swap_1d: -; GFX12: ; %bb.0: ; %main_body +; GFX12-LABEL: atomic_swap_1d: +; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN ; 
GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: ; return to shader part epilog - -define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) %out = bitcast i32 %v to float @@ -499,23 +1182,53 @@ main_body: ; from lds-bounds.ll ; covers ds_write_b64 (atomic without return) @compute_lds = external addrspace(3) global [512 x i32], align 16 -; GFX9-LABEL: {{^}}store_aligned: -; GFX9: ds_write_b64 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; -; GFX10-LABEL: {{^}}store_aligned: -; GFX10: ds_write_b64 v0, v[1:2] +define amdgpu_cs void @store_aligned(ptr addrspace(3) %ptr) #0 { +; GFX9-LABEL: store_aligned: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 43 +; GFX9-NEXT: ds_write_b64 v0, v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: store_aligned: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: v_mov_b32_e32 v2, 42 +; GFX90A-NEXT: v_mov_b32_e32 v3, 43 +; GFX90A-NEXT: ds_write_b64 v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_endpgm +; +; GFX10-LABEL: store_aligned: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v2, 43 +; GFX10-NEXT: ds_write_b64 v0, v[1:2] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: {{^}}store_aligned: -; GFX11: ds_store_b64 v0, v[1:2] +; GFX9-FLATSCR-LABEL: store_aligned: +; GFX9-FLATSCR: ; %bb.0: ; %entry +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 43 +; GFX9-FLATSCR-NEXT: ds_write_b64 v0, v[1:2] +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-LABEL: store_aligned: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v2, 43 +; GFX11-NEXT: ds_store_b64 v0, v[1:2] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: {{^}}store_aligned: -; GFX12: ds_store_b64 v0, v[1:2] +; GFX12-LABEL: store_aligned: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v2, 43 +; GFX12-NEXT: ds_store_b64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 - -define amdgpu_cs void @store_aligned(ptr addrspace(3) %ptr) #0 { +; GFX12-NEXT: s_endpgm entry: %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1 @@ -527,23 +1240,43 @@ entry: ; from lds-bounds.ll ; covers ds_read_b64 -; GFX9-LABEL: {{^}}load_aligned: -; GFX9: ds_read_b64 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; -; GFX10-LABEL: {{^}}load_aligned: -; GFX10: ds_read_b64 v[0:1], v0 +define amdgpu_cs <2 x float> @load_aligned(ptr addrspace(3) %ptr) #0 { +; GFX9-LABEL: load_aligned: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX90A-LABEL: load_aligned: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: ds_read_b64 v[0:1], v0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_aligned: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: ds_read_b64 v[0:1], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX9-FLATSCR-LABEL: load_aligned: +; GFX9-FLATSCR: ; %bb.0: ; %entry +; GFX9-FLATSCR-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: {{^}}load_aligned: -; GFX11: 
ds_load_b64 v[0:1], v0 +; GFX11-LABEL: load_aligned: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: ds_load_b64 v[0:1], v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: {{^}}load_aligned: -; GFX12: ds_load_b64 v[0:1], v0 +; GFX12-LABEL: load_aligned: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: ds_load_b64 v[0:1], v0 ; GFX12-NEXT: s_wait_dscnt 0x0 - -define amdgpu_cs <2 x float> @load_aligned(ptr addrspace(3) %ptr) #0 { +; GFX12-NEXT: ; return to shader part epilog entry: %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1 @@ -558,23 +1291,59 @@ entry: ; from lds-bounds.ll ; covers ds_write2_b32 -; GFX9-LABEL: {{^}}store_global_const_idx: -; GFX9: ds_write2_b32 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; -; GFX10-LABEL: {{^}}store_global_const_idx: -; GFX10: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 +define amdgpu_cs void @store_global_const_idx() #0 { +; GFX9-LABEL: store_global_const_idx: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 43 +; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: store_global_const_idx: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo +; GFX90A-NEXT: v_mov_b32_e32 v1, 42 +; GFX90A-NEXT: v_mov_b32_e32 v2, 43 +; GFX90A-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_endpgm +; +; GFX10-LABEL: store_global_const_idx: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v2, 43 +; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: store_global_const_idx: +; GFX9-FLATSCR: ; %bb.0: ; %entry +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 43 +; GFX9-FLATSCR-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_endpgm ; -; GFX11-LABEL: {{^}}store_global_const_idx: -; GFX11: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4 +; GFX11-LABEL: store_global_const_idx: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_dual_mov_b32 v0, compute_lds@abs32@lo :: v_dual_mov_b32 v1, 42 +; GFX11-NEXT: v_mov_b32_e32 v2, 43 +; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: {{^}}store_global_const_idx: -; GFX12: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4 +; GFX12-LABEL: store_global_const_idx: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: v_dual_mov_b32 v0, compute_lds@abs32@lo :: v_dual_mov_b32 v1, 42 +; GFX12-NEXT: v_mov_b32_e32 v2, 43 +; GFX12-NEXT: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4 ; GFX12-NEXT: s_wait_dscnt 0x0 - -define amdgpu_cs void @store_global_const_idx() #0 { +; GFX12-NEXT: s_endpgm entry: %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3 %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4 @@ -586,23 +1355,49 @@ entry: ; from lds-bounds.ll ; covers ds_read2_b32 -; GFX9-LABEL: {{^}}load_global_const_idx: -; GFX9: ds_read2_b32 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; -; GFX10-LABEL: {{^}}load_global_const_idx: -; GFX10: ds_read2_b32 v[0:1], v0 offset0:3 
offset1:4 +define amdgpu_cs <2 x float> @load_global_const_idx() #0 { +; GFX9-LABEL: load_global_const_idx: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo +; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX90A-LABEL: load_global_const_idx: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo +; GFX90A-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_global_const_idx: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo +; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: {{^}}load_global_const_idx: -; GFX11: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4 +; GFX9-FLATSCR-LABEL: load_global_const_idx: +; GFX9-FLATSCR: ; %bb.0: ; %entry +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo +; GFX9-FLATSCR-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_global_const_idx: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo +; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: {{^}}load_global_const_idx: -; GFX12: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4 +; GFX12-LABEL: load_global_const_idx: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo +; GFX12-NEXT: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4 ; GFX12-NEXT: s_wait_dscnt 0x0 - -define amdgpu_cs <2 x float> @load_global_const_idx() #0 { +; GFX12-NEXT: ; return to shader part epilog entry: %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3 %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4 From 1e3c7ddb357729a0dd9f279654a40374ef0cd976 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Sun, 31 Mar 2024 19:10:46 -0500 Subject: [PATCH 12/16] Replace testcases test_load_store() and test_load_store_as5() with tail_call_byval_align16() to cover buffer_load/store, and scratch_load/store. 
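(For reference, a minimal sketch of the shape of the replacement test; the callee name and value names below are illustrative, and the authoritative version, with its RUN lines and autogenerated CHECK output, is in the diff that follows. The byval stack argument forces the call to go through private-stack memory, which is what exercises buffer_load/buffer_store on MUBUF-scratch configurations and scratch_load/scratch_store when flat scratch is enabled.)

declare hidden void @callee(<32 x i32>, ptr addrspace(5) byval(double) align 16)

define void @tail_call_byval_align16_sketch(<32 x i32> %vgpr_args, double %unused) {
entry:
  ; private (addrspace 5) alloca passed byval with align 16 to the callee
  %tmp = alloca double, align 8, addrspace(5)
  tail call void @callee(<32 x i32> %vgpr_args, ptr addrspace(5) byval(double) align 16 %tmp)
  ret void
}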
--- .../insert_waitcnt_for_precise_memory.ll | 235 +++++++++--------- 1 file changed, 114 insertions(+), 121 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index ef9e4cc7528ce0..56a817cdd3996b 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -307,133 +307,126 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ret i32 %result } -; from bf16.ll -; covers flat_load, flat_store, global_load, global_store +; from call-argument-types.ll +; covers scratch_load, scratch_store, buffer_load, buffer_store ; -define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { -; GFX9-LABEL: test_load_store: -; GFX9: ; %bb.0: +declare hidden void @byval_align16_f64_arg(<32 x i32>, ptr addrspace(5) byval(double) align 16) +define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) { +; GFX9-LABEL: tail_call_byval_align16: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: test_load_store: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_ushort v0, v[0:1], off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_short v[2:3], v0, off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: test_load_store: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_short v[2:3], v0, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-FLATSCR-LABEL: test_load_store: -; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLATSCR-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_short v[2:3], v0, off -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: test_load_store: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b16 v[2:3], v0, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: test_load_store: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v0, v[0:1], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b16 v[2:3], v0, off -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %val = load bfloat, ptr addrspace(1) %in - store bfloat %val, ptr addrspace(1) %out - ret void -} - -; covers scratch_load, scratch_store, buffer_load, buffer_store -define void @test_load_store_as5(ptr addrspace(5) %in, ptr addrspace(5) %out) { -; GFX9-LABEL: test_load_store_as5: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[16:17] ; -; GFX90A-LABEL: test_load_store_as5: -; GFX90A: ; %bb.0: +; GFX90A-LABEL: tail_call_byval_align16: +; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_getpc_b64 s[16:17] +; GFX90A-NEXT: s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12 +; GFX90A-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:20 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:16 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_store_dword v34, off, s[0:3], s32 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[16:17] ; -; GFX10-LABEL: test_load_store_as5: -; GFX10: ; %bb.0: +; GFX10-LABEL: tail_call_byval_align16: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12 +; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:16 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_dword v34, off, s[0:3], s32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[16:17] ; -; GFX9-FLATSCR-LABEL: test_load_store_as5: -; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-LABEL: tail_call_byval_align16: +; GFX9-FLATSCR: ; %bb.0: ; %entry ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_load_ushort v0, v0, off +; GFX9-FLATSCR-NEXT: scratch_load_dword v32, off, s32 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_short v1, v0, off +; GFX9-FLATSCR-NEXT: s_getpc_b64 s[0:1] +; GFX9-FLATSCR-NEXT: s_add_u32 s0, s0, 
byval_align16_f64_arg@rel32@lo+4 +; GFX9-FLATSCR-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 +; GFX9-FLATSCR-NEXT: scratch_store_dword off, v32, s32 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; GFX9-FLATSCR-NEXT: scratch_load_dwordx2 v[32:33], off, s32 offset:24 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx2 off, v[32:33], s32 offset:16 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[0:1] ; -; GFX11-LABEL: test_load_store_as5: -; GFX11: ; %bb.0: +; GFX11-LABEL: tail_call_byval_align16: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_load_u16 v0, v0, off +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b16 v1, v0, off +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: scratch_load_b64 v[32:33], off, s32 offset:24 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[0:1] ; -; GFX12-LABEL: test_load_store_as5: -; GFX12: ; %bb.0: +; GFX12-LABEL: tail_call_byval_align16: +; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: scratch_load_u16 v0, v0, off +; GFX12-NEXT: scratch_load_b32 v32, off, s32 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: scratch_store_b16 v1, v0, off +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+16 +; GFX12-NEXT: scratch_store_b32 off, v32, s32 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %val = load bfloat, ptr addrspace(5) %in - store bfloat %val, ptr addrspace(5) %out +; GFX12-NEXT: scratch_load_b64 v[32:33], off, s32 offset:24 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b64 off, v[32:33], s32 offset:16 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[0:1] +entry: + %alloca = alloca double, align 8, addrspace(5) + tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca) ret void } @@ -704,7 +697,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -715,7 +708,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: atomic_add_local: @@ -725,7 +718,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB6_2 +; GFX90A-NEXT: s_cbranch_execz .LBB5_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -736,7 +729,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX90A-NEXT: ds_add_u32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB6_2: +; GFX90A-NEXT: .LBB5_2: ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: atomic_add_local: @@ -745,7 +738,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB6_2 +; GFX10-NEXT: s_cbranch_execz .LBB5_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -756,7 +749,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX10-NEXT: ds_add_u32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: .LBB6_2: +; GFX10-NEXT: .LBB5_2: ; GFX10-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: atomic_add_local: @@ -766,7 +759,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; GFX9-FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -777,7 +770,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: .LBB6_2: +; GFX9-FLATSCR-NEXT: .LBB5_2: ; GFX9-FLATSCR-NEXT: s_endpgm ; ; GFX11-LABEL: atomic_add_local: @@ -787,7 +780,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB5_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -798,7 +791,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX11-NEXT: ds_add_u32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: .LBB6_2: +; GFX11-NEXT: .LBB5_2: ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_local: @@ -808,7 +801,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB6_2 +; GFX12-NEXT: s_cbranch_execz .LBB5_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -819,7 +812,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-NEXT: ds_add_u32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: .LBB6_2: +; GFX12-NEXT: .LBB5_2: ; GFX12-NEXT: s_endpgm %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst ret void @@ -839,7 +832,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -848,7 +841,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -867,7 +860,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB7_2 +; GFX90A-NEXT: s_cbranch_execz .LBB6_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -876,7 +869,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: .LBB7_2: +; GFX90A-NEXT: .LBB6_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -894,7 +887,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB7_2 +; GFX10-NEXT: s_cbranch_execz .LBB6_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -903,7 +896,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: .LBB7_2: +; GFX10-NEXT: .LBB6_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -923,7 +916,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -932,7 +925,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: .LBB7_2: +; GFX9-FLATSCR-NEXT: .LBB6_2: ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -951,7 +944,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: 
s_cbranch_execz .LBB6_2
; GFX11-NEXT: ; %bb.1:
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -961,7 +954,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: .LBB7_2:
+; GFX11-NEXT: .LBB6_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -981,7 +974,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: ; implicit-def: $vgpr1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX12-NEXT: s_cbranch_execz .LBB7_2
+; GFX12-NEXT: s_cbranch_execz .LBB6_2
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -991,7 +984,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: v_mov_b32_e32 v1, s3
; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: .LBB7_2:
+; GFX12-NEXT: .LBB6_2:
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
From 4f4bf313293f81a2dddec4b9e7d058a24df676ec Mon Sep 17 00:00:00 2001
From: Jun Wang
Date: Tue, 2 Apr 2024 17:30:36 -0500
Subject: [PATCH 13/16] Check if the memory instruction is already immediately
 followed by a waitcnt instruction. If so, do not insert another waitcnt. Also
 add a testcase that has ds_add_rtn. The formatting change made to
 SIMemoryLegalizer.cpp is reverted.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 14 +-
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 10 +-
 .../insert_waitcnt_for_precise_memory.ll | 214 ++++++++++++++++--
 3 files changed, 214 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 556ec3e231ff19..fe1bf9478cde87 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2306,11 +2306,15 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 #endif
   if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
-    AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
-        Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
-    ScoreBrackets.simplifyWaitcnt(Wait);
-    Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
-                                ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
+    Iter++;
+    if (!isWaitInstr(*Iter)) {
+      AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
+          Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
+      ScoreBrackets.simplifyWaitcnt(Wait);
+      Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
+                                  ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
+    }
+    Iter--;
   }
   LLVM_DEBUG({
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 91a4f8973f2990..c5f256562f78b7 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2606,10 +2606,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
       MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
       MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
       MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
-    Changed |=
-        CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
- isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER); + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getInstrAddrSpace(), + isAtomicRet(*MI) ? SIMemOp::LOAD : + SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::AFTER); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 56a817cdd3996b..bb23324caec264 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -707,7 +707,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_endpgm ; @@ -728,7 +727,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: ds_add_u32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB5_2: ; GFX90A-NEXT: s_endpgm ; @@ -769,7 +767,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB5_2: ; GFX9-FLATSCR-NEXT: s_endpgm ; @@ -818,6 +815,193 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ret void } +; from atomic_load_add.ll +; covers s_load, ds_add_rtn (atomic with return) +; +define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrspace(3) %local) { +; GFX9-LABEL: atomic_add_ret_local: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: atomic_add_ret_local: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB6_2 +; GFX90A-NEXT: ; %bb.1: +; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX90A-NEXT: s_mul_i32 s4, s4, 5 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: 
ds_add_rtn_u32 v1, v1, v2 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB6_2: +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX90A-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_add_ret_local: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB6_2 +; GFX10-NEXT: ; %bb.1: +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: s_mul_i32 s3, s3, 5 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: .LBB6_2: +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: atomic_add_ret_local: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_mov_b64 s[4:5], exec +; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-FLATSCR-NEXT: ; %bb.1: +; GFX9-FLATSCR-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: .LBB6_2: +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_ret_local: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: ; %bb.1: +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s3, s3, 5 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: .LBB6_2: +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_ret_local: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB6_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s3, s3, 5 +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: .LBB6_2: +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm + %val = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst + store i32 %val, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) ; from atomic_optimizations_buffer.ll @@ -832,7 +1016,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -841,7 +1025,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -860,7 +1044,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB6_2 +; GFX90A-NEXT: s_cbranch_execz .LBB7_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -869,7 +1053,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: .LBB6_2: +; GFX90A-NEXT: .LBB7_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt 
lgkmcnt(0) @@ -887,7 +1071,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB6_2 +; GFX10-NEXT: s_cbranch_execz .LBB7_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -896,7 +1080,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: .LBB6_2: +; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -916,7 +1100,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -925,7 +1109,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: .LBB6_2: +; GFX9-FLATSCR-NEXT: .LBB7_2: ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -944,7 +1128,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB7_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -954,7 +1138,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: .LBB6_2: +; GFX11-NEXT: .LBB7_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -974,7 +1158,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB6_2 +; GFX12-NEXT: s_cbranch_execz .LBB7_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -984,7 +1168,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: .LBB6_2: +; GFX12-NEXT: .LBB7_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 From 4ae38b6877893c5bbb70c35a7fab51bbc5665733 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 2 Apr 2024 18:36:07 -0500 Subject: [PATCH 14/16] Change iterator 
update operator from post-inc to pre-inc.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index fe1bf9478cde87..8426310db95aae 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2306,7 +2306,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 #endif
   if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
-    Iter++;
+    ++Iter;
     if (!isWaitInstr(*Iter)) {
       AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
           Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
@@ -2314,7 +2314,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
                                   ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
     }
-    Iter--;
+    --Iter;
   }
   LLVM_DEBUG({
From 49cad2dfb106acf33586f59d6d4f7af9c3af51ee Mon Sep 17 00:00:00 2001
From: Jun Wang
Date: Thu, 4 Apr 2024 13:57:15 -0500
Subject: [PATCH 15/16] With #87539, the check added in the previous commit for
 a waitcnt instruction immediately after a load/store is no longer necessary.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 14 +++++---------
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 9 ++++-----
 2 files changed, 9 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8426310db95aae..556ec3e231ff19 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2306,15 +2306,11 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 #endif
   if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
-    ++Iter;
-    if (!isWaitInstr(*Iter)) {
-      AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
-          Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
-      ScoreBrackets.simplifyWaitcnt(Wait);
-      Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
-                                  ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
-    }
-    --Iter;
+    AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
+        Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
+    ScoreBrackets.simplifyWaitcnt(Wait);
+    Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
+                                ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
   }
   LLVM_DEBUG({
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index c5f256562f78b7..62306fa667b360 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2484,11 +2484,10 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
   // Atomic instructions already bypass caches to the scope specified by the
   // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
   // instructions need additional treatment.
-  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
-  // need additional treatment.
-  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
-                                                SIMemOp::LOAD, MOI.isVolatile(),
-                                                MOI.isNonTemporal());
+  Changed |= CC->enableVolatileAndOrNonTemporal(
+      MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
+      MOI.isNonTemporal(), MOI.isLastUse());
+
   return Changed;
 }
From 6d52f6e74511d916b6901fed9718fe8ddd87c14b Mon Sep 17 00:00:00 2001
From: Jun Wang
Date: Mon, 8 Apr 2024 19:25:27 -0500
Subject: [PATCH 16/16] Add a testcase that covers flat_atomic_swap, an atomic
 without return.
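(For reference, a minimal sketch of the IR shape being covered; names are illustrative, and the authoritative testcase, with its RUN lines and autogenerated CHECK output, is in the diff that follows. The key points are the generic (flat) pointer and the unused result, which make this the no-return form of the atomic.)

define void @flat_xchg_noret_sketch(ptr %p, i32 %v) {
  ; flat atomic exchange; the result is intentionally unused ("atomic without return")
  %old = atomicrmw xchg ptr %p, i32 %v seq_cst
  ret void
}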
--- .../insert_waitcnt_for_precise_memory.ll | 116 ++++++++++++++---- 1 file changed, 92 insertions(+), 24 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index bb23324caec264..df03e893703777 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -815,6 +815,74 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ret void } +; from flat_atomics_i32_system.ll +; covers flat_atomic_swap (atomic without return) +; +define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) { +; GFX9-LABEL: flat_atomic_xchg_i32_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_atomic_xchg_i32_noret: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_atomic_xchg_i32_noret: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: flat_atomic_xchg_i32_noret: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: flat_atomic_swap v[0:1], v2 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: buffer_wbinvl1_vol +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_atomic_xchg_i32_noret: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_xchg_i32_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst + ret void +} + ; from atomic_load_add.ll ; covers s_load, ds_add_rtn (atomic with return) ; @@ -827,7 +895,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -837,7 +905,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -856,7 +924,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB6_2 +; GFX90A-NEXT: s_cbranch_execz .LBB7_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -866,7 +934,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB6_2: +; GFX90A-NEXT: .LBB7_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -884,7 +952,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB6_2 +; GFX10-NEXT: s_cbranch_execz .LBB7_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -895,7 +963,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: .LBB6_2: +; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -915,7 +983,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; GFX9-FLATSCR-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -925,7 +993,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: .LBB6_2: +; GFX9-FLATSCR-NEXT: .LBB7_2: ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -944,7 +1012,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB7_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -955,7 +1023,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: .LBB6_2: +; GFX11-NEXT: .LBB7_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -975,7 +1043,7 @@ define amdgpu_kernel void 
@atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB6_2 +; GFX12-NEXT: s_cbranch_execz .LBB7_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -986,7 +1054,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: .LBB6_2: +; GFX12-NEXT: .LBB7_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1016,7 +1084,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1025,7 +1093,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1044,7 +1112,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB7_2 +; GFX90A-NEXT: s_cbranch_execz .LBB8_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1053,7 +1121,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: .LBB7_2: +; GFX90A-NEXT: .LBB8_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1071,7 +1139,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB7_2 +; GFX10-NEXT: s_cbranch_execz .LBB8_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1080,7 +1148,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: .LBB7_2: +; GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -1100,7 +1168,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-FLATSCR-NEXT: ; 
%bb.1: ; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -1109,7 +1177,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: .LBB7_2: +; GFX9-FLATSCR-NEXT: .LBB8_2: ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -1128,7 +1196,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1138,7 +1206,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: .LBB7_2: +; GFX11-NEXT: .LBB8_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1158,7 +1226,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB7_2 +; GFX12-NEXT: s_cbranch_execz .LBB8_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1168,7 +1236,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: .LBB7_2: +; GFX12-NEXT: .LBB8_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0