
[AMDGPU] Emit a waitcnt instruction after each memory instruction #79236

Merged

jwanggit86 merged 16 commits into llvm:main from waitcnt-after-each-mem-insn on Apr 10, 2024

Conversation

jwanggit86 (Contributor)

This patch introduces a new command-line option for clang, namely, -mamdgpu-precise-memory-op. When this option is specified, a waitcnt instruction is generated after each memory load/store instruction. The counter values are always 0, but which counters are involved depends on the memory instruction.

@llvmbot added the clang (Clang issues not falling into any other category) and clang:driver ('clang' and 'clang++' user-facing binaries; not 'clang-cl') labels on Jan 24, 2024
@llvmbot (Member) commented Jan 24, 2024

@llvm/pr-subscribers-clang
@llvm/pr-subscribers-clang-driver

@llvm/pr-subscribers-backend-amdgpu

Author: Jun Wang (jwanggit86)

Full diff: https://github.com/llvm/llvm-project/pull/79236.diff

6 Files Affected:

  • (modified) clang/include/clang/Driver/Options.td (+4)
  • (modified) clang/test/Driver/amdgpu-features.c (+6)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+4)
  • (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3)
  • (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+79)
  • (added) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll (+199)
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 7f4fa33748facaf..d570786534b3611 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4796,6 +4796,10 @@ defm tgsplit : SimpleMFlag<"tgsplit", "Enable", "Disable",
 defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64",
   "Specify wavefront size 64", "Specify wavefront size 32",
   " mode (AMDGPU only)">;
+defm amdgpu_precise_memory_op
+    : SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable",
+                  " precise memory mode (AMDGPU only)",
+                  m_amdgpu_Features_Group>;
 
 defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics",
   TargetOpts<"AllowAMDGPUUnsafeFPAtomics">, DefaultFalse,
diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c
index a516bc6b7ff2004..57d31ccedd8783e 100644
--- a/clang/test/Driver/amdgpu-features.c
+++ b/clang/test/Driver/amdgpu-features.c
@@ -32,3 +32,9 @@
 
 // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-cumode %s 2>&1 | FileCheck --check-prefix=NO-CUMODE %s
 // NO-CUMODE: "-target-feature" "-cumode"
+
+// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mamdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=PREC-MEM %s
+// PREC-MEM: "-target-feature" "+amdgpu-precise-memory-op"
+
+// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s
+// NO-PREC-MEM: "-target-feature" "-amdgpu-precise-memory-op"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index cb29d5d94759812..c39cc9477023591 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
   "Enable CU wavefront execution mode"
 >;
 
+def FeaturePreciseMemory
+    : SubtargetFeature<"amdgpu-precise-memory-op", "EnablePreciseMemory",
+                       "true", "Enable precise memory mode">;
+
 def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
   "SGPRInitBug",
   "true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 8019b98b1c68d66..b69df21f7859851 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -87,6 +87,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool EnableTgSplit = false;
   bool EnableCuMode = false;
   bool TrapHandler = false;
+  bool EnablePreciseMemory = false;
 
   // Used as options.
   bool EnableLoadStoreOpt = false;
@@ -592,6 +593,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return EnableCuMode;
   }
 
+  bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
+
   bool hasFlatAddressSpace() const {
     return FlatAddressSpace;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 84b9330ef9633eb..93cdceb37bd5017 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -17,6 +17,7 @@
 #include "AMDGPUMachineModuleInfo.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -24,6 +25,8 @@
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/TargetParser/TargetParser.h"
 
+#include <iostream>
+
 using namespace llvm;
 using namespace llvm::AMDGPU;
 
@@ -641,6 +644,9 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                 MachineBasicBlock::iterator &MI);
 
+  bool GFX9InsertWaitcntForPreciseMem(MachineFunction &MF);
+  bool GFX10And11InsertWaitcntForPreciseMem(MachineFunction &MF);
+
 public:
   static char ID;
 
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
   return Changed;
 }
 
+bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  IsaVersion IV = getIsaVersion(ST.getCPU());
+
+  bool Changed = false;
+
+  for (auto &MBB : MF) {
+    for (auto MI = MBB.begin(); MI != MBB.end();) {
+      MachineInstr &Inst = *MI;
+      ++MI;
+      if (!Inst.mayLoadOrStore())
+        continue;
+
+      // Todo: if next insn is an s_waitcnt
+      AMDGPU::Waitcnt Wait;
+
+      if (!(Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic)) {
+        if (TII->isSMRD(Inst)) {          // scalar
+          Wait.DsCnt = 0;                 // LgkmCnt
+        } else {                          // vector
+          if (Inst.mayLoad()) {           // vector load
+            if (TII->isVMEM(Inst))        // VMEM load
+              Wait.LoadCnt = 0;           // VmCnt
+            else if (TII->isFLAT(Inst)) { // Flat load
+              Wait.LoadCnt = 0;           // VmCnt
+              Wait.DsCnt = 0;             // LgkmCnt
+            } else                        // LDS load ?
+              Wait.DsCnt = 0;             // LgkmCnt
+          } else {                        // vector store
+            if (TII->isVMEM(Inst))        // VMEM store
+              Wait.LoadCnt = 0;           // VmCnt
+            else if (TII->isFLAT(Inst)) { // Flat store
+              Wait.LoadCnt = 0;           // VmCnt
+              Wait.DsCnt = 0;             // LgkmCnt
+            } else
+              Wait.DsCnt = 0; // LDS store? LgkmCnt
+          }
+        }                 // vector
+      } else {            // atomic
+        Wait.DsCnt = 0;   // LgkmCnt
+        Wait.LoadCnt = 0; // VmCnt
+      }
+
+      unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+      BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+bool SIMemoryLegalizer::GFX10And11InsertWaitcntForPreciseMem(
+    MachineFunction &MF) {
+  for (auto &MBB : MF) {
+    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+      MachineInstr &Inst = *MI;
+      if (!Inst.mayLoadOrStore())
+        continue;
+    }
+  }
+  return true;
+}
+
 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
@@ -2601,6 +2671,15 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
   }
 
   Changed |= removeAtomicPseudoMIs();
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (ST.isPreciseMemoryEnabled()) {
+    if (AMDGPU::isGFX10Plus(ST))
+      Changed |= GFX10And11InsertWaitcntForPreciseMem(MF);
+    else
+      Changed |= GFX9InsertWaitcntForPreciseMem(MF);
+  }
+
   return Changed;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
new file mode 100644
index 000000000000000..abb9b9071227f85
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
@@ -0,0 +1,199 @@
+; Testing the -amdgpu-precise-memory-op option
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A
+; COM: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
+
+; from atomicrmw-expand.ll
+; covers flat_load, flat_atomic
+define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A:  ; %bb.0:
+; GFX90A:         flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A:  .LBB0_1: ; %atomicrmw.start
+; GFX90A:         flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
+  ret void
+}
+
+; from atomicrmw-nand.ll
+; covers global_atomic, global_load
+define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
+; GFX9-LABEL: atomic_nand_i32_global:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NOT:     s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-NEXT:    v_not_b32_e32 v2, v3
+; GFX9-NEXT:    v_or_b32_e32 v2, -5, v2
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
+  ret i32 %result
+}
+
+; from bf16.ll
+; covers buffer_load, buffer_store, flat_load, flat_store, global_load, global_store
+define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+;
+; GFX9-LABEL: test_load_store:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %val = load bfloat, ptr addrspace(1) %in
+  store bfloat %val, ptr addrspace(1) %out
+  ret void
+}
+
+; from scratch-simple.ll
+; covers scratch_load, scratch_store
+;
+; GFX9-FLATSCR-LABEL: {{^}}vs_main:
+; GFX9-FLATSCR:        scratch_store_dwordx4 off, v[{{[0-9:]+}}],
+; GFX9-FLATSCR-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR:        scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
+; GFX9-FLATSCR-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0)
+define amdgpu_vs float @vs_main(i32 %idx) {
+  %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
+  %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
+  %r = fadd float %v1, %v2
+  ret float %r
+}
+
+; from udiv.ll
+; covers s_load
+define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
+; GFX9-LABEL: udiv_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NOT:     s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+  %r = udiv i32 %x, %y
+  store i32 %r, ptr addrspace(1) %out
+  ret void
+}
+
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32)
+
+; from smrd.ll
+; covers s_buffer_load
+; GFX9-LABEL: {{^}}smrd_sgpr_offset:
+; GFX9:         s_buffer_load_dword s{{[0-9]}}, s[0:3], s4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 {
+main_body:
+  %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
+  ret float %r
+}
+
+; from atomic_load_add.ll
+; covers s_load, ds_add
+; GFX9-LABEL: atomic_add_local:
+; GFX9:       ; %bb.1:
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9:         ds_add_u32 v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
+   %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
+   ret void
+}
+
+declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
+
+; from atomic_optimizations_buffer.ll
+; covers buffer_atomic
+; GFX9-LABEL: add_i32_constant:
+; GFX9:       ; %bb.1:
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9:         buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
+entry:
+  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
+  store i32 %old, ptr addrspace(1) %out
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32)
+
+; from llvm.amdgcn.image.load.a16.ll
+; covers image_load
+; GFX9-LABEL: {{^}}load.f32.1d:
+; GFX9:         image_load v0, v0, s[0:7] dmask:0x1 unorm a16
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i32>, i32, i32)
+
+; from llvm.amdgcn.image.store.a16.ll
+; covers image_store
+define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+; GFX9-LABEL: store_f32_1d:
+; GFX9:       ; %bb.0: ; %main_body
+; GFX9-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32)
+
+; from llvm.amdgcn.image.atomic.dim.ll
+; covers image_atomic
+; GFX90A-LABEL: {{^}}atomic_swap_1d:
+; GFX90A: image_atomic_swap v0, v{{[02468]}}, s[0:7] dmask:0x1 unorm glc{{$}}
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+
+
+

      AMDGPU::Waitcnt Wait;

      if (!(Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic)) {
        if (TII->isSMRD(Inst)) { // scalar
Contributor

This logic would need updating again for GFX12. It seems like it's duplicating a lot of knowledge which is already implemented in SIInsertWaitcnts.

Contributor Author

That's a valid point. However, even though it does similar work, I'd say the similarity to SIInsertWaitcnts goes only so far. The differences include: (1) This is for a different purpose, i.e., to support the so-called "precise memory mode", in particular precise memory exceptions, for certain GPUs. (2) This feature is optional while SIInsertWaitcnts is not. (3) The counter values in SIInsertWaitcnts are precise, while in this feature the counters are simply set to 0.
If performance is a concern, please note that this feature is controlled by a command-line option which is off by default. The user has to explicitly give the option for it to work. We assume the user knows there's extra work for the compiler when the option is turned on.

Contributor Author

@t-tye Hi Tony, do you have any comments? Note that the function for gfx10 and 11 is empty for now. I want to get some feedback before going further.

Contributor

Can we have a shared helper, e.g. in SIInstrInfo, for both? It's a lot of logic to duplicate.

> The counter values in SIInsertWaitcnts are precise, while in this feature the counters are simply set to 0.

That could just be a boolean switch in a shared helper.
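For concreteness, a minimal sketch of such a helper, assuming it lives in SIInstrInfo; the name and signature are hypothetical, and the classification mirrors the switch in GFX9InsertWaitcntForPreciseMem above (whose load and store branches set the same counters, so they are collapsed here):

```cpp
// Hypothetical shared helper: classify a memory instruction into the
// counters it completes on. Precise mode would force the counts to zero;
// SIInsertWaitcnts could instead feed in its tracked upper bounds.
AMDGPU::Waitcnt
SIInstrInfo::getPreciseMemWaitcnt(const MachineInstr &MI) const {
  AMDGPU::Waitcnt Wait;
  if (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) {
    Wait.LoadCnt = 0;        // vmcnt
    Wait.DsCnt = 0;          // lgkmcnt
  } else if (isSMRD(MI)) {   // scalar
    Wait.DsCnt = 0;          // lgkmcnt
  } else if (isVMEM(MI)) {   // VMEM load/store
    Wait.LoadCnt = 0;        // vmcnt
  } else if (isFLAT(MI)) {   // flat load/store
    Wait.LoadCnt = 0;        // vmcnt
    Wait.DsCnt = 0;          // lgkmcnt
  } else {                   // LDS
    Wait.DsCnt = 0;          // lgkmcnt
  }
  return Wait;
}
```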

Contributor

Both the memory legalizer and SIInsertWaitcnts are required passes. This feature is not optional if requested. I do think this makes more sense to belong in SIInsertWaitcnts

@jwanggit86 jwanggit86 requested a review from t-tye January 25, 2024 00:34
@@ -641,6 +644,9 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

  bool GFX9InsertWaitcntForPreciseMem(MachineFunction &MF);
Collaborator

Should these be combined with the expand* functions? They are supposed to do all that is necessary to "legalize" the opcodes to meet the memory model, and inserting these waitcnts is just another piece of that expansion.

Combining them would also avoid inserting multiple waitcnts for the same memory operation.

Combining them may also make it possible to use the existing operation that ensures a memory operation has completed. I believe that operation should already be determining what kinds of waitcnts need to be inserted. If not, then I would consider generalizing it so it can be used by both the atomics expansion and the precise memory expansion.

It also keeps the operations in this class architecture neutral.

Contributor

Agreed, this should definitely be a virtual function such as insertWaitcntForPreciseMem and let the CacheControl implementation do what is needed. This is just emulating what CacheControl already does
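A sketch of the shape being suggested; the method name comes from this comment, while the override wiring is an assumption based on how the SICacheControl subclasses are organized:

```cpp
class SICacheControl {
public:
  // ...
  /// Insert the precise-memory wait after the memory operation at \p MI.
  /// Returns true if the function was changed.
  virtual bool
  insertWaitcntForPreciseMem(MachineBasicBlock::iterator &MI) const = 0;
};

// Each ISA-specific cache-control class then supplies its own waits:
class SIGfx6CacheControl : public SICacheControl {
public:
  bool
  insertWaitcntForPreciseMem(MachineBasicBlock::iterator &MI) const override;
  // ...
};
```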

Contributor Author

Sure. Code has been updated as suggested.


@jwanggit86 (Contributor Author)

@t-tye Code has been updated based on your feedback. Please take a look.

@Pierre-vh (Contributor) left a comment

When you've made changes, you can click the "Re-request review" icon next to a reviewer's name to put the PR back in their review queue :)

std::unique_ptr<SIPreciseMemorySupport>
SIPreciseMemorySupport::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation < AMDGPUSubtarget::GFX10)
Contributor

GFX12 is missing

Contributor Author

Not sure this is required for GFX12. @t-tye Tony, is this required for GFX12? We didn't discuss this for GFX12.

Collaborator

Is there a reason that this functionality should not be available for any target? It is true it is only particularly useful for targets that have no precise memory operations hardware support, but the basic idea is meaningful for any target.

Contributor Author

Please take a look at the updated code. I'll be working on gfx12 in the meantime.

}

bool SIGfx10And11PreciseMemorySupport::handleAtomic(
    MachineBasicBlock::iterator &MI, bool ret) {
Contributor

Use something more descriptive than ret + use CamelCase, e.g. IsReturningAtomic

@jwanggit86 (Contributor Author) commented Feb 7, 2024

Changed "ret" to "IsAtomicWithRet".

; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX90A
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX10
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
Contributor

gfx11 and 12 tests

Contributor Author

Tests for gfx11 added. Gfx12 is ongoing.

@jayfoad (Contributor) commented Feb 7, 2024

> This logic would need updating again for GFX12. It seems like it's duplicating a lot of knowledge which is already implemented in SIInsertWaitcnts.

Just to demonstrate, you could implement this feature in SIInsertWaitcnts for all supported architectures with something like:

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6ecb1c8bf6e1..910cd094f8f2 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2299,6 +2299,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
     updateEventWaitcntAfter(Inst, &ScoreBrackets);
 
+    AMDGPU::Waitcnt Wait =
+        AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts());
+    ScoreBrackets.simplifyWaitcnt(Wait);
+    Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
+                                ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
+
 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
     // If this instruction generates a S_SETVSKIP because it is an
     // indexed resource, and we are on Tahiti, then it will also force

Handling VSCNT/STORECNT correctly is a little more complicated but not much.
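For illustration, one way the store-counter side could look, extending Jay's diff above; this is a sketch only: the LoadCnt/DsCnt field names follow the AMDGPU::Waitcnt struct used in this patch, while StoreCnt and ST->hasVscnt() are assumptions about the surrounding API:

```cpp
// On GFX10+, VMEM stores complete on a separate counter (vscnt), so a
// precise wait after a store also needs StoreCnt = 0; generateWaitcnt
// would then emit the extra s_waitcnt_vscnt instruction.
AMDGPU::Waitcnt Wait;
Wait.LoadCnt = 0;    // vmcnt
Wait.DsCnt = 0;      // lgkmcnt
if (ST->hasVscnt())
  Wait.StoreCnt = 0; // vscnt, GFX10+ only
ScoreBrackets.simplifyWaitcnt(Wait);
Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
                            ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
```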

@Pierre-vh (Contributor) left a comment

I also agree with Jay; can't this go in SIInsertWaitcnts? Why does it have to go in SIMemoryLegalizer instead?

If it has to stay here, fine, but is it possible to merge some code with SIInsertWaitcnts in a common helper somewhere?

bool ret) override;
};

#if 0
Contributor

Remove code in #if 0

Contributor Author

Done.

Contributor Author

Regarding the question about SIInsertWaitcnts, initially the code was indeed put there (see PR 68932). After a discussion with @t-tye and others, it was decided to do it in SIMemoryLegalizer. I think the main reason is that functionality-wise this new feature is more closely related to the memory legalizer. Tony can explain this better. Also, the memory legalizer, like SIInsertWaitcnts, already iterates over each instruction.
Based on the implementation experience, I'd say another reason that supports putting it in the memory legalizer is that there's already a class hierarchy for the different ISAs, making it easier to implement ISA-specific behaviors.
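For readers following along, the hierarchy referred to here, pieced together from the snippets quoted in this review; the bodies are elided and the GFX9 class name is inferred, so treat this as a sketch rather than the exact patch:

```cpp
class SIPreciseMemorySupport {
public:
  virtual ~SIPreciseMemorySupport() = default;

  /// Pick the ISA-specific implementation for the current subtarget.
  static std::unique_ptr<SIPreciseMemorySupport> create(const GCNSubtarget &ST);

  virtual bool
  handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) = 0;
  virtual bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI,
                                            bool IsAtomicWithRet) = 0;
};

// One subclass per ISA family, e.g.:
class SIGfx9PreciseMemorySupport : public SIPreciseMemorySupport { /* ... */ };
class SIGfx10And11PreciseMemorySupport : public SIPreciseMemorySupport { /* ... */ };
```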

  bool
  handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) override;
  bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI,
                                    bool ret) override;
Contributor

ret -> Ret
(CamelCase)

Contributor Author

Changed "ret" to "IsAtomicWithRet".

class SIMemoryLegalizer final : public MachineFunctionPass {
private:
  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// Precise Memory support.
  bool PM = false;
Contributor

PM is commonly used for PassManager, so I'd just use PrecMem or something similar. Of course PassManager makes no sense here, but I personally thought of that first before thinking PreciseMemory.

Contributor Author

Changed "PM" to "PrecMem".

Comment on lines 2665 to 2669:

    if (ret) {
      BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT)).addImm(0);
    } else {
      BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_STORECNT)).addImm(0);
    }
Contributor

can drop {}

Contributor Author

Done.


github-actions bot commented Feb 15, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@Pierre-vh (Contributor) left a comment

Did you try to move this to SIInsertWaitcnts, as suggested?

Comment on lines 367 to 368:

  /// Handles atomic instruction \p MI with \p IsAtomicWithRet indicating
  /// whether \p MI returns a result.
Contributor

IMO the comment isn't needed, I think the function's name describes it well enough (and the comment just before adds enough context), the comment just rephrases the function + argument names

Contributor Author

Adding the comments is mainly to be consistent with the existing code. Here the comment points out that the 2nd parameter is a property of the first parameter (MI), which is not absolutely clear by just looking at the code.

@jwanggit86 (Contributor Author)

> Did you try to move this to SIInsertWaitcnts, as suggested?

Please see my reply on Feb 15, which is copy-pasted below.

> Regarding the question about SIInsertWaitcnts, initially the code was indeed put there (see PR #68932). After a discussion with @t-tye and others, it was decided to do it in SIMemoryLegalizer. I think the main reason is that functionality-wise this new feature is more closely related to the memory legalizer. Tony can explain this better. Also, the memory legalizer, like SIInsertWaitcnts, already iterates over each instruction.
> Based on the implementation experience, I'd say another reason that supports putting it in the memory legalizer is that there's already a class hierarchy for the different ISAs, making it easier to implement ISA-specific behaviors.

@jwanggit86 (Contributor Author)

@Pierre-vh Any further comments?

@Pierre-vh (Contributor) left a comment

LGTM, but wait for @t-tye or @jayfoad to approve as well

@jayfoad (Contributor) left a comment

I've added some inline comments, but really I don't want to spend the time to review this properly (or maintain it, or extend it for new architectures in future). All this logic already exists in SIInsertWaitcnts. Duplicating it here is not a good design.

      handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) = 0;
  /// Handles atomic instruction \p MI with \p IsAtomicWithRet indicating
  /// whether \p MI returns a result.
  virtual bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI,
Contributor

This function is never even called.

      LoadAndStore = true;
    } else if (Inst.mayLoad()) {   // vector load
      if (TII->isVMEM(Inst))       // VMEM load
        WaitType = AMDGPU::S_WAIT_LOADCNT;
Contributor

What about SAMPLECNT and BVHCNT?


  MachineInstr &Inst = *MI;
  unsigned WaitType = 0;
  // For some vector instructions, mayLoad() and mayStore() can be both true.
Contributor

What kind of (non-atomic) instructions is this supposed to handle?

@t-tye (Collaborator) commented Feb 28, 2024

I am not clear why new functions need to be added for this, as I think there are existing functions that already do this.

}

; from atomic_load_add.ll
; covers s_load, ds_add (atomic without return)
Contributor

Can you add a test for DS atomic with return, e.g. ds_add_rtn?

Contributor Author

Done.

        Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
                                    ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
      }
      Iter--;
Contributor

You shouldn't need to manipulate the iterator here. Do you just need to get rid of the std::next, or use Iter directly above?

Contributor Author

When I see a load/store instruction, I need to check if the next one is a waitcnt. A waitcnt is only inserted if the next one is not a waitcnt. Thus the Iter++ and Iter--.

Contributor

Something is wrong, you shouldn't need to check for neighbors?

@@ -2326,6 +2326,18 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
    }
#endif

    if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
      ++Iter;
      if (!isWaitInstr(*Iter)) {
@jayfoad (Contributor) commented Apr 3, 2024

This isn't right. This will prevent adding your waitcnt if any waitcnt follows, even a wait for a different counter.

Normally generateWaitcnt should combine and remove redundant waitcnts like the ones in your test. It looks like the problem is that this does not normally happen at the end of a basic block. I have tried fixing that in #87539. If that patch is committed, you should be able to rebase this patch and the duplicate waitcnts will get removed.

Contributor

#87539 has been merged. You can merge-or-rebase main into your branch and remove this isWaitInstr check.
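After that rebase, the insertion should reduce to roughly Jay's earlier diff with no neighbor check; a sketch, assuming the same surrounding SIInsertWaitcnts context as that diff:

```cpp
if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
  AMDGPU::Waitcnt Wait =
      AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts());
  ScoreBrackets.simplifyWaitcnt(Wait);
  // Redundant waits are combined and removed by the pass itself (since
  // llvm#87539 this also happens at the end of a block), so no explicit
  // isWaitInstr check is needed here.
  Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
                              ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
}
```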

Jun Wang added 15 commits April 4, 2024 12:34

  • This patch introduces a new command-line option for clang, namely, amdgpu-precise-mem-op. When this option is specified, a waitcnt instruction is generated after each memory load/store instruction. The counter values are always 0, but which counters are involved depends on the memory instruction.
  • … tail_call_byval_align16() to cover buffer_load/store, and scratch_load/store.
  • … waitcnt instruction. If so, do not insert another waitcnt. Also add a testcase that has ds_add_rtn. Formatting change made to SIMemoryLegalizer.cpp is reverted.
  • …diately after a load/store is not necessary.
@jayfoad (Contributor) commented Apr 5, 2024

Can you add at least one test for a VMEM (flat or scratch or global or buffer or image) atomic without return? That should use vscnt on GFX10.

Apart from that, the SIInsertWaitcnts.cpp changes and tests look good to me. I have not reviewed the clang parts, but it looks like @Pierre-vh approved them previously?

@jwanggit86 (Contributor Author)

Added a testcase that has flat_atomic_swap, which is an atomic without return.

@jwanggit86 (Contributor Author)

@jayfoad Do you have any more comments?

@jayfoad (Contributor) commented Apr 10, 2024

No further comments.

@jwanggit86 jwanggit86 merged commit 86842e1 into llvm:main Apr 10, 2024
4 checks passed
@jwanggit86 jwanggit86 deleted the waitcnt-after-each-mem-insn branch April 10, 2024 17:47
searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request May 2, 2024
…ch memory instruction (llvm#79236)

This patch introduces a new command-line option for clang, namely,
amdgpu-precise-mem-op (or precise-memory in the backend). When this option is specified, a waitcnt
instruction is generated after each memory load/store instruction. The
counter values are always 0, but which counters are involved depends on
the memory instruction.

---------

Co-authored-by: Jun Wang <[email protected]>
Change-Id: Ieeac771ce0facbdff3c6149945bbabc95d7cb48c
Labels: backend:AMDGPU, clang, clang:driver