Jul 18, 2025 · Jul 16, 2025 · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

    // Copies and REG_SEQUENCE do not contribute to the final assembly
    // So, skip them but take care of the SGPR to VGPR copies bookkeeping.
    if (Inst->isCopy() || Inst->isRegSequence()) {
      if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
        if (!Inst->isCopy() ||
            !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
          Info.NumSVCopies++;
          continue;
        }
    if (Inst->isRegSequence() &&
        TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
      Info.NumSVCopies++;
      continue;
    }
    if (Inst->isCopy()) {
      const TargetRegisterClass *SrcRC, *DstRC;
      std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI);
      if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) &&
          !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
        Info.NumSVCopies++;
        continue;
      }
    }

diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN %s


 ; Loop kernel whose two conditional branches are driven by lanes 0 and 1 of a
 ; <4 x i1> value: a compare on the loop-carried scalar (%iv.nonzero) OR'ed with
 ; a compare on the zero-extended workitem id (%tid.ge1).
 ; NOTE(review): presumably reduced from the input that motivated the
 ; SGPR -> vreg_1 copy handling in SIFixSGPRCopies; confirm against the PR.
 ; All IR values are named (rather than numbered) so the test stays readable
 ; and can be edited without renumbering.
 define amdgpu_kernel void @copy_to_vreg_1(i32 %n) {
 ; GCN-LABEL: copy_to_vreg_1:
 ; GCN:       ; %bb.0: ; %._crit_edge
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x24
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    v_mov_b64_e32 v[2:3], 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, 1, s4
 ; GCN-NEXT:    s_cmp_lt_u32 s4, 2
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    s_and_b64 s[2:3], s[0:1], exec
 ; GCN-NEXT:    s_cselect_b32 s3, s5, 1
 ; GCN-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GCN-NEXT:    s_addc_u32 s0, 1, 0
 ; GCN-NEXT:    v_readfirstlane_b32 s2, v1
 ; GCN-NEXT:    s_cmp_ge_u32 s3, s4
 ; GCN-NEXT:    s_cselect_b32 s4, s0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cmp_lg_u64 0, 0
 ; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GCN-NEXT:    s_branch .LBB0_3
 ; GCN-NEXT:  .LBB0_1: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-NEXT:    s_xor_b64 s[8:9], exec, -1
 ; GCN-NEXT:  .LBB0_2: ; %Flow2
 ; GCN-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_and_b64 s[4:5], exec, s[8:9]
 ; GCN-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
 ; GCN-NEXT:    s_mov_b32 s4, 0
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_8
 ; GCN-NEXT:  .LBB0_3: ; %.lr.ph27
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
 ; GCN-NEXT:    s_xor_b64 s[6:7], s[8:9], -1
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], s[8:9]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_5
 ; GCN-NEXT:  ; %bb.4: ; %pred.store.if
 ; GCN-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    global_store_byte v[2:3], v1, off
 ; GCN-NEXT:  .LBB0_5: ; %Flow1
 ; GCN-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_mov_b64 s[8:9], -1
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], s[6:7]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_2
 ; GCN-NEXT:  ; %bb.6: ; %pred.store.continue
 ; GCN-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[2:3]
 ; GCN-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_1
 ; GCN-NEXT:  ; %bb.7: ; %pred.store.if41
 ; GCN-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; GCN-NEXT:    global_store_byte v[2:3], v1, off
 ; GCN-NEXT:    s_branch .LBB0_1
 ; GCN-NEXT:  .LBB0_8: ; %DummyReturnBlock
 ; GCN-NEXT:    s_endpgm
 ._crit_edge:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %div = udiv i32 1, %n
  br label %.lr.ph27

 .lr.ph27:                                         ; preds = %pred.store.if41, %pred.store.continue, %._crit_edge
  %iv = phi i32 [ %div, %._crit_edge ], [ 0, %pred.store.if41 ], [ 0, %pred.store.continue ]
  %iv.nonzero = icmp ugt i32 %iv, 0
  %broadcast.splatinsert37 = insertelement <4 x i1> zeroinitializer, i1 %iv.nonzero, i64 0
  %.zext = zext i32 %tid to i64
  %broadcast.splatinsert39 = insertelement <4 x i64> zeroinitializer, i64 %.zext, i64 0
  %tid.ge1 = icmp uge <4 x i64> %broadcast.splatinsert39, splat (i64 1)
  %cond.vec = or <4 x i1> %tid.ge1, %broadcast.splatinsert37
  %cond.0 = extractelement <4 x i1> %cond.vec, i64 0
  br i1 %cond.0, label %pred.store.if, label %pred.store.continue

 pred.store.if:                                    ; preds = %.lr.ph27
  store i8 0, ptr addrspace(1) null, align 64
  br label %pred.store.continue

 pred.store.continue:                              ; preds = %pred.store.if, %.lr.ph27
  %cond.1 = extractelement <4 x i1> %cond.vec, i64 1
  br i1 %cond.1, label %pred.store.if41, label %.lr.ph27

 pred.store.if41:                                  ; preds = %pred.store.continue
  store i8 0, ptr addrspace(1) null, align 64
  br label %.lr.ph27
 }

 declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #0

 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir
 # RUN: llc -mtriple=amdgcn -run-pass si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s

 ---
 # Input (bb.0): a scalar compare S_CMP_GT_U32 whose two operands are sreg_32
 # COPYs of VGPR values (%0 and $vgpr1); its $scc result is copied into an
 # sreg_64 and from there into a vreg_1.
 # Expected after si-fix-sgpr-copies (see the check lines below): the compare
 # is rewritten as V_CMP_GT_U32_e64 producing an sreg_64_xexec value, and the
 # vreg_1 COPY reads that value directly.
 name:            copy_to_vreg_1
 tracksRegLiveness: true
 body:             |
  ; GCN-LABEL: name: copy_to_vreg_1
  ; GCN: bb.0:
  ; GCN-NEXT:   successors: %bb.1(0x80000000)
  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT:   [[V_CVT_U32_F32_e64:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec
  ; GCN-NEXT:   [[IMPLICIT_DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
  ; GCN-NEXT:   [[V_CMP_GT_U32_e64:%[0-9]+]]:sreg_64_xexec = samesign V_CMP_GT_U32_e64 [[V_CVT_U32_F32_e64]], killed [[COPY1]], implicit $exec
  ; GCN-NEXT:   [[VREG1:%[0-9]+]]:vreg_1 = COPY [[V_CMP_GT_U32_e64]]
  ; GCN-NEXT: {{  $}}
  ; GCN-NEXT: bb.1:
  ; GCN-NEXT:   S_ENDPGM 0
  bb.0:
    liveins: $vgpr0, $vgpr1
      %0:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec
      %1:sreg_32 = COPY %0:vgpr_32
      %2:sreg_32 = COPY $vgpr1
      samesign S_CMP_GT_U32 %1:sreg_32, killed %2:sreg_32, implicit-def $scc
      %3:sreg_64 = COPY $scc
      %4:vreg_1 = COPY %3:sreg_64

  bb.1:
      S_ENDPGM 0
 ...
Original file line number	Diff line number	Diff line change
Expand Up		@@ -946,13 +946,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {

		// Copies and REG_SEQUENCE do not contribute to the final assembly
		// So, skip them but take care of the SGPR to VGPR copies bookkeeping.
		if (Inst->isCopy() || Inst->isRegSequence()) {
		if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
		if (!Inst->isCopy() ||
		!tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
		Info.NumSVCopies++;
		continue;
		}
		if (Inst->isRegSequence() &&
		TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
		Info.NumSVCopies++;
		continue;
		}
		if (Inst->isCopy()) {
		const TargetRegisterClass *SrcRC, *DstRC;
		std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI);
		if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) &&
		!tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
		Info.NumSVCopies++;
		continue;
		}
		}

Expand Down
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,101 @@
		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
		; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN %s

arsenm marked this conversation as resolved. Show resolvedHide resolved

		define amdgpu_kernel void @copy_to_vreg_1(i32 %0) {
		; GCN-LABEL: copy_to_vreg_1:
		; GCN: ; %bb.0: ; %._crit_edge
		; GCN-NEXT: s_load_dword s4, s[4:5], 0x24
		; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
		; GCN-NEXT: v_mov_b64_e32 v[2:3], 0
		; GCN-NEXT: s_waitcnt lgkmcnt(0)
		; GCN-NEXT: s_sub_i32 s5, 1, s4
		; GCN-NEXT: s_cmp_lt_u32 s4, 2
		; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
		; GCN-NEXT: s_and_b64 s[2:3], s[0:1], exec
		; GCN-NEXT: s_cselect_b32 s3, s5, 1
		; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
		; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
		; GCN-NEXT: s_addc_u32 s0, 1, 0
		; GCN-NEXT: v_readfirstlane_b32 s2, v1
		; GCN-NEXT: s_cmp_ge_u32 s3, s4
		; GCN-NEXT: s_cselect_b32 s4, s0, s2
		; GCN-NEXT: v_mov_b32_e32 v1, 0
		; GCN-NEXT: s_cmp_lg_u64 0, 0
		; GCN-NEXT: s_mov_b64 s[0:1], 0
		; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
		; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
		; GCN-NEXT: s_branch .LBB0_3
		; GCN-NEXT: .LBB0_1: ; %Flow
		; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
		; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
		; GCN-NEXT: s_xor_b64 s[8:9], exec, -1
		; GCN-NEXT: .LBB0_2: ; %Flow2
		; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
		; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
		; GCN-NEXT: s_and_b64 s[4:5], exec, s[8:9]
		; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
		; GCN-NEXT: s_mov_b32 s4, 0
		; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1]
		; GCN-NEXT: s_cbranch_execz .LBB0_8
		; GCN-NEXT: .LBB0_3: ; %.lr.ph27
		; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
		; GCN-NEXT: s_cmp_lg_u32 s4, 0
		; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
		; GCN-NEXT: s_or_b64 s[8:9], vcc, s[4:5]
		; GCN-NEXT: s_xor_b64 s[6:7], s[8:9], -1
		; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[8:9]
		; GCN-NEXT: s_cbranch_execz .LBB0_5
		; GCN-NEXT: ; %bb.4: ; %pred.store.if
		; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
		; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec
		; GCN-NEXT: global_store_byte v[2:3], v1, off
		; GCN-NEXT: .LBB0_5: ; %Flow1
		; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
		; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
		; GCN-NEXT: s_mov_b64 s[8:9], -1
		; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
		; GCN-NEXT: s_cbranch_execz .LBB0_2
		; GCN-NEXT: ; %bb.6: ; %pred.store.continue
		; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
		; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[2:3]
		; GCN-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
		; GCN-NEXT: s_cbranch_execz .LBB0_1
		; GCN-NEXT: ; %bb.7: ; %pred.store.if41
		; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
		; GCN-NEXT: global_store_byte v[2:3], v1, off
		; GCN-NEXT: s_branch .LBB0_1
		; GCN-NEXT: .LBB0_8: ; %DummyReturnBlock
		; GCN-NEXT: s_endpgm
		._crit_edge:
		%1 = tail call i32 @llvm.amdgcn.workitem.id.x()
Review comment (arsenm, Contributor, Jul 18, 2025): Should use named values in tests.
		%2 = udiv i32 1, %0
		br label %.lr.ph27

		.lr.ph27: ; preds = %pred.store.if41, %pred.store.continue, %._crit_edge
		%3 = phi i32 [ %2, %._crit_edge ], [ 0, %pred.store.if41 ], [ 0, %pred.store.continue ]
		%4 = icmp ugt i32 %3, 0
		%broadcast.splatinsert37 = insertelement <4 x i1> zeroinitializer, i1 %4, i64 0
		%.zext = zext i32 %1 to i64
		%broadcast.splatinsert39 = insertelement <4 x i64> zeroinitializer, i64 %.zext, i64 0
		%5 = icmp uge <4 x i64> %broadcast.splatinsert39, splat (i64 1)
		%6 = or <4 x i1> %5, %broadcast.splatinsert37
		%7 = extractelement <4 x i1> %6, i64 0
		br i1 %7, label %pred.store.if, label %pred.store.continue

		pred.store.if: ; preds = %.lr.ph27
		store i8 0, ptr addrspace(1) null, align 64
		br label %pred.store.continue

		pred.store.continue: ; preds = %pred.store.if, %.lr.ph27
		%8 = extractelement <4 x i1> %6, i64 1
		br i1 %8, label %pred.store.if41, label %.lr.ph27

		pred.store.if41: ; preds = %pred.store.continue
		store i8 0, ptr addrspace(1) null, align 64
		br label %.lr.ph27
		}

		declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #0

		attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,31 @@
		# RUN: llc -mtriple=amdgcn -run-pass si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s

		---
		name: copy_to_vreg_1
		tracksRegLiveness: true
		body: |
		; GCN-LABEL: name: copy_to_vreg_1
		; GCN: bb.0:
		; GCN-NEXT: successors: %bb.1(0x80000000)
		; GCN-NEXT: liveins: $vgpr0, $vgpr1
		; GCN-NEXT: {{ $}}
		; GCN-NEXT: [[V_CVT_U32_F32_e64:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec
		; GCN-NEXT: [[IMPLICIT_DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
		; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
		; GCN-NEXT: [[V_CMP_GT_U32_e64:%[0-9]+]]:sreg_64_xexec = samesign V_CMP_GT_U32_e64 [[V_CVT_U32_F32_e64]], killed [[COPY1]], implicit $exec
		; GCN-NEXT: [[VREG1:%[0-9]+]]:vreg_1 = COPY [[V_CMP_GT_U32_e64]]
		; GCN-NEXT: {{ $}}
		; GCN-NEXT: bb.1:
		; GCN-NEXT: S_ENDPGM 0
		bb.0:
		liveins: $vgpr0, $vgpr1
		%0:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec
		%1:sreg_32 = COPY %0:vgpr_32
		%2:sreg_32 = COPY $vgpr1
		samesign S_CMP_GT_U32 %1:sreg_32, killed %2:sreg_32, implicit-def $scc
		%3:sreg_64 = COPY $scc
		%4:vreg_1 = COPY %3:sreg_64

		bb.1:
		S_ENDPGM 0
		...