Notifications: You must be signed in to change notification settings.
Fork 33
Star 125

Commit 3cec690

committed

Adding support for refining ElementWise, ExpandDims and Broadcast

1 parent 96060c0, commit 3cec690 (Copy full SHA for 3cec690)

File tree

3 files changed

+495

-3

lines changed

lib/Dialect/TritonGPU/IR
- LinearLayoutConversions.cpp
test/TritonGPU/amd
- amd-extractslice-op.mlir
third_party/amd/lib/TritonAMDGPUTransforms
- RefineOps.cpp

3 files changed

+495

-3

lines changed

`‎lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp`

Lines changed: 14 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -394,17 +394,28 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {`
`394`	`394`	`// For the lane (i.e., thread) dimension, these threads are along the`
`395`	`395`	`// matrix C's N dimension, with 32 consecutive threads covering a whole`
`396`	`396`	`// row and the next 32 threads start after a gap spanning 4 rows.`
	`397`	`+ std::vector<std::vector<int>> regBases = { {0,1}, {0,2} };`
	`398`	`+if (getWarpsPerCTA()[1] >1) {`
	`399`	`+ regBases.push_back({0,8});`
	`400`	`+ regBases.push_back({0,16});`
	`401`	`+ }`
`397`	`402`	`tileLayout =LinearLayout(`
`398`		`- {{kRegister,{{0,1}, {0,2}, {0,8},/gap/ {0,16}}},`
	`403`	`+ {{kRegister,regBases},`
`399`	`404`	`{kLane, {{1,0}, {2,0}, {4,0}, {8,0}, {16,0},/gap/ {0,4}}}},`
`400`	`405`	`{outDimNames[order[0]], outDimNames[order[1]]});`
`401`	`406`	`// For mfma.transposed layout, the element ownership among threads are`
`402`	`407`	`// "transposed" within each warp.`
`403`		`-if (getIsTransposed())`
	`408`	`+if (getIsTransposed()) {`
	`409`	`+ regBases = { {1,0}, {2,0} };`
	`410`	`+if (getWarpsPerCTA()[1] >1) {`
	`411`	`+ regBases.push_back({8,0});`
	`412`	`+ regBases.push_back({16,0});`
	`413`	`+ }`
`404`	`414`	`tileLayout =LinearLayout(`
`405`		`- {{kRegister,{{1,0}, {2,0}, {8,0},/gap/ {16,0}}},`
	`415`	`+ {{kRegister,regBases},`
`406`	`416`	`{kLane, {{0,1}, {0,2}, {0,4}, {0,8}, {0,16},/gap/ {4,0}}}},`
`407`	`417`	`{outDimNames[order[0]], outDimNames[order[1]]});`
	`418`	`+ }`
`408`	`419`	`}else {`
`409`	`420`	`assert(getMDim() ==16);`
`410`	`421`	`// For mfma with 16x16 output, each of the 64 threads holds 4 elements.`

`‎test/TritonGPU/amd/amd-extractslice-op.mlir`

Lines changed: 36 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -49,3 +49,39 @@ module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32,`
`49`	`49`	`tt.return`
`50`	`50`	`}`
`51`	`51`	`}`
	`52`	`+`
	`53`	`+#mma0 =#ttg.amd_mfma<{versionMajor=3,versionMinor=0,warpsPerCTA= [4,1],instrShape= [16,16],isTransposed=true}>`
	`54`	`+moduleattributes {"ttg.compute-capability" =0 :i32,"ttg.num-ctas" =1 :i32,"ttg.num-warps" =8 :i32,"ttg.threads-per-warp" =64 :i32} {`
	`55`	`+ tt.func@extract_slice_slice_mma_1d(%arg0:tensor<256xi32,#ttg.slice<{dim=1,parent=#mma0}>> {tt.divisibility=16 :i32}) {`
	`56`	`+// CHECK: llvm.func @extract_slice_slice_mma_1d`
	`57`	`+// CHECK-COUNT-4: %{{[0-9]}} = llvm.extractvalue %arg0[{{[0-9]}}] : !llvm.struct<(i32, i32, i32, i32)>`
	`58`	`+// CHECK: %4 = llvm.mlir.undef : !llvm.struct<(i32, i32)>`
	`59`	`+// CHECK-COUNT-2: %{{[0-9]}} = llvm.insertvalue %{{[0-9]}}, %{{[0-9]}}[{{[0-9]}}] : !llvm.struct<(i32, i32)>`
	`60`	`+%1 =amdgpu.extract_slice%arg0 [128] :tensor<256xi32,#ttg.slice<{dim=1,parent=#mma0}>>totensor<128xi32,#ttg.slice<{dim=1,parent=#mma0}>>`
	`61`	`+ tt.return`
	`62`	`+ }`
	`63`	`+}`
	`64`	`+`
	`65`	`+#mma1 =#ttg.amd_mfma<{versionMajor=3,versionMinor=0,warpsPerCTA= [4,1],instrShape= [16,16],isTransposed=true}>`
	`66`	`+moduleattributes {"ttg.compute-capability" =0 :i32,"ttg.num-ctas" =1 :i32,"ttg.num-warps" =4 :i32,"ttg.threads-per-warp" =64 :i32} {`
	`67`	`+ tt.func@extract_slice_slice_mma_2d_16(%arg0:tensor<128x64xf32,#mma1> {tt.divisibility=16 :i32}) {`
	`68`	`+// CHECK: llvm.func @extract_slice_slice_mma_2d_16`
	`69`	`+// CHECK-COUNT-32: %{{[0-9]}} = llvm.extractvalue %arg0[{{[0-9]}}] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>`
	`70`	`+// CHECK: %32 = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32)>`
	`71`	`+// CHECK-COUNT-4: %{{[0-9]}} = llvm.insertvalue %{{[0-9]}}, %{{[0-9]}}[{{[0-9]}}] : !llvm.struct<(f32, f32, f32, f32)>`
	`72`	`+%1 =amdgpu.extract_slice%arg0 [0,0] :tensor<128x64xf32,#mma1>totensor<64x16xf32,#mma1>`
	`73`	`+ tt.return`
	`74`	`+ }`
	`75`	`+}`
	`76`	`+`
	`77`	`+#mma2 =#ttg.amd_mfma<{versionMajor=3,versionMinor=0,warpsPerCTA= [4,1],instrShape= [32,32],isTransposed=true}>`
	`78`	`+moduleattributes {"ttg.compute-capability" =0 :i32,"ttg.num-ctas" =1 :i32,"ttg.num-warps" =4 :i32,"ttg.threads-per-warp" =64 :i32} {`
	`79`	`+ tt.func@extract_slice_slice_mma_2d_32(%arg0:tensor<128x64xf32,#mma2> {tt.divisibility=16 :i32}) {`
	`80`	`+// CHECK: llvm.func @extract_slice_slice_mma_2d_32`
	`81`	`+// CHECK-COUNT-32: %{{[0-9]}} = llvm.extractvalue %arg0[{{[0-9]}}] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>`
	`82`	`+// CHECK: %32 = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32)>`
	`83`	`+// CHECK-COUNT-4: %{{[0-9]}} = llvm.insertvalue %{{[0-9]}}, %{{[0-9]}}[{{[0-9]}}] : !llvm.struct<(f32, f32, f32, f32)>`
	`84`	`+%1 =amdgpu.extract_slice%arg0 [0,0] :tensor<128x64xf32,#mma2>totensor<128x8xf32,#mma2>`
	`85`	`+ tt.return`
	`86`	`+ }`
	`87`	`+}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 3cec690

File tree

3 files changed

3 files changed

`‎lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp`

`‎test/TritonGPU/amd/amd-extractslice-op.mlir`

0 commit comments