Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 695660c

Browse files
authored
[AMDGPU] Provide control to force VGPR MFMA form (#148079)
This gives an override to the user to force select VGPR form of MFMA. Eventually we will drop this in favor of compiler making better decisions, but this provides a mechanism for users to address the cases where MayNeedAGPRs favors the AGPR form and performance is degraded due to poor RA.
1 parent abdd453 commit 695660c

File tree

3 files changed

+3994
-2
lines changed

3 files changed

+3994
-2
lines changed

‎llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,16 @@ enum { MAX_LANES = 64 };
2929

3030
usingnamespacellvm;
3131

32+
// TODO -- delete this flag once we have more robust mechanisms to allocate the
33+
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
34+
// where it is better to produce the VGPR form (e.g. if there are VGPR users
35+
// of the MFMA result).
36+
cl::opt<bool>MFMAVGPRForm(
37+
"amdgpu-mfma-vgpr-form", cl::Hidden,
38+
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If"
39+
"unspecified, default to compiler heuristics"),
40+
cl::init(false));
41+
3242
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
3343
const SITargetLowering *TLI = STI->getTargetLowering();
3444
returnstatic_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
@@ -69,8 +79,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
6979
PSInputAddr =AMDGPU::getInitialPSInputAddr(F);
7080
}
7181

72-
MayNeedAGPRs = ST.hasMAIInsts();
73-
if (ST.hasGFX90AInsts() &&
82+
MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
83+
if (!MFMAVGPRForm &&ST.hasGFX90AInsts() &&
7484
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
7585
!mayUseAGPRs(F))
7686
MayNeedAGPRs =false;// We will select all MAI with VGPR operands.
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s
4+
5+
declare <4 xfloat>@llvm.amdgcn.mfma.f32.16x16x32.f16(<8 xhalf>, <8 xhalf>, <4 xfloat>,i32 immarg,i32 immarg,i32 immarg)
6+
7+
define <4 xfloat>@default(<8 xhalf>%arg0, <8 xhalf>%arg1, <4 xfloat>%arg2) {
8+
; HEURRC-LABEL: default:
9+
; HEURRC: ; %bb.0:
10+
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11+
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
12+
; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
13+
; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
14+
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
15+
; HEURRC-NEXT: s_nop 1
16+
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
17+
; HEURRC-NEXT: s_nop 7
18+
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
19+
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
20+
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
21+
; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
22+
; HEURRC-NEXT: s_setpc_b64 s[30:31]
23+
;
24+
; VGPRRC-LABEL: default:
25+
; VGPRRC: ; %bb.0:
26+
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27+
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
28+
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
29+
%result =call <4 xfloat>@llvm.amdgcn.mfma.f32.16x16x32.f16(<8 xhalf>%arg0, <8 xhalf>%arg1, <4 xfloat>%arg2,i320,i320,i320)
30+
ret <4 xfloat>%result
31+
}
32+
33+
define <4 xfloat>@request_agpr(<8 xhalf>%arg0, <8 xhalf>%arg1, <4 xfloat>%arg2) #0 {
34+
; HEURRC-LABEL: request_agpr:
35+
; HEURRC: ; %bb.0:
36+
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37+
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
38+
; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
39+
; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
40+
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
41+
; HEURRC-NEXT: s_nop 1
42+
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
43+
; HEURRC-NEXT: s_nop 7
44+
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
45+
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
46+
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
47+
; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
48+
; HEURRC-NEXT: s_setpc_b64 s[30:31]
49+
;
50+
; VGPRRC-LABEL: request_agpr:
51+
; VGPRRC: ; %bb.0:
52+
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
53+
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
54+
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
55+
%result =call <4 xfloat>@llvm.amdgcn.mfma.f32.16x16x32.f16(<8 xhalf>%arg0, <8 xhalf>%arg1, <4 xfloat>%arg2,i320,i320,i320)
56+
ret <4 xfloat>%result
57+
}
58+
59+
define <4 xfloat>@request_no_agpr(<8 xhalf>%arg0, <8 xhalf>%arg1, <4 xfloat>%arg2) #1 {
60+
; HEURRC-LABEL: request_no_agpr:
61+
; HEURRC: ; %bb.0:
62+
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63+
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
64+
; HEURRC-NEXT: s_setpc_b64 s[30:31]
65+
;
66+
; VGPRRC-LABEL: request_no_agpr:
67+
; VGPRRC: ; %bb.0:
68+
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69+
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
70+
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
71+
%result =call <4 xfloat>@llvm.amdgcn.mfma.f32.16x16x32.f16(<8 xhalf>%arg0, <8 xhalf>%arg1, <4 xfloat>%arg2,i320,i320,i320)
72+
ret <4 xfloat>%result
73+
}
74+
75+
attributes #0 = {"amdgpu-agpr-alloc"="32,256" }
76+
attributes #1 = {"amdgpu-agpr-alloc"="0,0" }

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp