@@ -49,3 +49,39 @@ module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32,
49
49
tt.return
50
50
}
51
51
}
52
+
53
// 1-D case: the input is a 256-element i32 tensor whose encoding is a slice
// (dim = 1) of the MFMA layout #mma0 declared below.
// NOTE(review): this module sets "ttg.num-warps" = 8 while #mma0 uses
// warpsPerCTA = [4, 1]; the other MFMA cases in this file use 4 warps.
// Confirm the mismatch is intentional.
#mma0 = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @extract_slice_slice_mma_1d(%arg0: tensor<256xi32, #ttg.slice<{dim = 1, parent = #mma0}>> {tt.divisibility = 16 : i32}) {
    // The directives below expect the lowering to read four i32 values out of
    // the argument struct and then build a two-element i32 result struct.
    // CHECK: llvm.func @extract_slice_slice_mma_1d
    // CHECK-COUNT-4: %{{[0-9]*}} = llvm.extractvalue %arg0[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32)>
    // CHECK: %4 = llvm.mlir.undef : !llvm.struct<(i32, i32)>
    // CHECK-COUNT-2: %{{[0-9]*}} = llvm.insertvalue %{{[0-9]*}}, %{{[0-9]*}}[{{[0-9]*}}] : !llvm.struct<(i32, i32)>
    // Slice 128 of the 256 elements; the bracketed [128] is presumably the
    // static start offset (verify against the op definition).
    %1 = amdgpu.extract_slice %arg0 [128] : tensor<256xi32, #ttg.slice<{dim = 1, parent = #mma0}>> to tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma0}>>
    tt.return
  }
}
64
+
65
// 2-D case with a 16x16 MFMA instruction shape: the input is a 128x64 f32
// tensor carrying the #mma1 layout directly.
#mma1 = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @extract_slice_slice_mma_2d_16(%arg0: tensor<128x64xf32, #mma1> {tt.divisibility = 16 : i32}) {
    // The directives below expect 32 f32 values to be read out of the argument
    // struct, followed by construction of a four-element f32 result struct.
    // CHECK: llvm.func @extract_slice_slice_mma_2d_16
    // CHECK-COUNT-32: %{{[0-9]*}} = llvm.extractvalue %arg0[{{[0-9]*}}] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
    // CHECK: %32 = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32)>
    // CHECK-COUNT-4: %{{[0-9]*}} = llvm.insertvalue %{{[0-9]*}}, %{{[0-9]*}}[{{[0-9]*}}] : !llvm.struct<(f32, f32, f32, f32)>
    // Take a 64x16 sub-tensor; the bracketed [0, 0] is presumably the static
    // start offset per dimension (verify against the op definition).
    %1 = amdgpu.extract_slice %arg0 [0, 0] : tensor<128x64xf32, #mma1> to tensor<64x16xf32, #mma1>
    tt.return
  }
}
76
+
77
// 2-D case with a 32x32 MFMA instruction shape: the input is a 128x64 f32
// tensor carrying the #mma2 layout directly.
#mma2 = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @extract_slice_slice_mma_2d_32(%arg0: tensor<128x64xf32, #mma2> {tt.divisibility = 16 : i32}) {
    // The directives below expect 32 f32 values to be read out of the argument
    // struct, followed by construction of a four-element f32 result struct.
    // CHECK: llvm.func @extract_slice_slice_mma_2d_32
    // CHECK-COUNT-32: %{{[0-9]*}} = llvm.extractvalue %arg0[{{[0-9]*}}] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
    // CHECK: %32 = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32)>
    // CHECK-COUNT-4: %{{[0-9]*}} = llvm.insertvalue %{{[0-9]*}}, %{{[0-9]*}}[{{[0-9]*}}] : !llvm.struct<(f32, f32, f32, f32)>
    // Take a 128x8 sub-tensor; the bracketed [0, 0] is presumably the static
    // start offset per dimension (verify against the op definition).
    %1 = amdgpu.extract_slice %arg0 [0, 0] : tensor<128x64xf32, #mma2> to tensor<128x8xf32, #mma2>
    tt.return
  }
}