Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings
/jaxPublic

Commit c584cbe

Browse files
apaszke and Google-ML-Automation
authored and committed
[Mosaic GPU] Add support for 32x32b loads/stores of arbitrary TMEM layouts
If the array in registers uses a layout exactly equal to the TMEM layout, and each register is exactly 32-bit, then the whole load/store operation is a trivial copy of registers into TMEM. This also adds support for arbitrary bitwidths in 32x32b TMEM transfers. PiperOrigin-RevId: 837775689
1 parent 03fd0cd commit c584cbe

File tree

4 files changed

+182
-63
lines changed

4 files changed

+182
-63
lines changed

‎jax/_src/pallas/mosaic_gpu/core.py‎

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1432,7 +1432,7 @@ def to_mgpu(self, *args, **kwargs) -> mgpu.FragmentedLayout:
14321432

14331433
@dataclasses.dataclass(frozen=True)
14341434
classParameterizedLayout(SomeLayout):
1435-
layout_cls:Layout
1435+
layout_cls:Layout|TMEMLayout
14361436
args:Sequence[Any]
14371437
kwargs:Any
14381438

@@ -1473,6 +1473,7 @@ class Layout(SomeLayout, enum.Enum):
14731473
TCGEN05_TRANSPOSED=enum.auto()
14741474
TCGEN05_M64_COLLECTIVE=enum.auto()
14751475
TCGEN05_TMEM_NATIVE=enum.auto()
1476+
TCGEN05_M64_COLLECTIVE_NATIVE=enum.auto()
14761477

14771478
SMEM_GMEM_COPY=enum.auto()
14781479
TMA_GATHER_INDICES=enum.auto()
@@ -1525,6 +1526,8 @@ def check_no_args():
15251526
returnmgpu.TMEM_NATIVE_LAYOUT
15261527
caseLayout.TCGEN05_M64_COLLECTIVE:
15271528
returntcgen05.fa_m64_collective_layout(*args,**kwargs)# pytype: disable=missing-parameter
1529+
caseLayout.TCGEN05_M64_COLLECTIVE_NATIVE:
1530+
returntcgen05.tmem_m64_collective_layout(*args,**kwargs).as_tiled_layout()# pytype: disable=missing-parameter
15281531
caseLayout.SMEM_GMEM_COPY:
15291532
normalize_args=lambdashape,dtype,swizzle: (shape,dtype,swizzle)
15301533
shape,dtype,swizzle=normalize_args(*args,**kwargs)
@@ -1548,15 +1551,22 @@ def check_no_args():
15481551

15491552
classTMEMLayout(enum.Enum):
15501553
"""Layout for TMEM references."""
1554+
# TODO(apaszke): Remove the layout suffix.
15511555
SCALES_LAYOUT=enum.auto()
15521556
SPARSE_METADATA_LAYOUT=enum.auto()
1557+
M64_COLLECTIVE_LAYOUT=enum.auto()
15531558

1554-
defto_mgpu(self)->tcgen05.TMEMLayout:
1559+
def__call__(self,*args,**kwargs)->ParameterizedLayout:
1560+
returnParameterizedLayout(self,args,kwargs)
1561+
1562+
defto_mgpu(self,*args,**kwargs)->tcgen05.TMEMLayout:
15551563
matchself:
15561564
caseTMEMLayout.SCALES_LAYOUT:
1557-
returntcgen05.scales_layout()
1565+
returntcgen05.scales_layout(*args,**kwargs)
15581566
caseTMEMLayout.SPARSE_METADATA_LAYOUT:
1559-
returntcgen05.sparse_meta_layout()
1567+
returntcgen05.sparse_meta_layout(*args,**kwargs)
1568+
caseTMEMLayout.M64_COLLECTIVE_LAYOUT:
1569+
returntcgen05.tmem_m64_collective_layout(*args,**kwargs)# pytype: disable=missing-parameter
15601570

15611571

15621572
defTryClusterCancelResult(

‎jax/experimental/mosaic/gpu/tcgen05.py‎

Lines changed: 95 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,11 @@ def canonicalize(self) -> TMEMLayout:
902902
_check_canonical=False,
903903
)
904904

905+
defas_tiled_layout(self)->fa.TiledLayout:
906+
returnfa.TiledLayout(
907+
self.tiling,self.warp_dims,self.lane_dims,self.vector_dim
908+
)
909+
905910

906911
def_infer_tmem_load_registers_layout(
907912
tmem_layout:TMEMLayout,columns:int,packing:int
@@ -1115,23 +1120,47 @@ def slice(self, *idxs) -> TMEMRef:
11151120
)
11161121

11171122
defload(self,layout:fa.TiledLayout|None=None,is_signed:bool|None=None)->fa.FragmentedArray:
1118-
ifutils.bitwidth(self.dtype)notin {16,32}:
1119-
raiseNotImplementedError(f"Unsupported dtype:{self.dtype}")
11201123
packing=self.packing
11211124
iflayoutisNone:
11221125
layout=_infer_tmem_load_registers_layout(
11231126
self.layout,self.shape[1],packing
11241127
)
1128+
bitwidth=utils.bitwidth(self.dtype)
1129+
has_default_layout=self.layout==tmem_default_layout(packing=packing)
11251130
regs_shape=layout.registers_shape(self.shape)
11261131
ifregs_shape[0]!=1:# We'll need to issue multiple loads below.
11271132
raiseNotImplementedError("Loading multiple row tiles")
11281133
iflayout==LAYOUTandself.layout==tmem_default_layout(packing=packing):
11291134
registers=_load_32xcols(
11301135
self.address,self.shape[1],self.dtype,packing
11311136
).T.reshape(regs_shape)
1132-
eliflayout==TMEM_NATIVE_LAYOUTandself.layout==tmem_default_layout(packing=packing):
1137+
eliflayout==self.layout.as_tiled_layout()andpacking*bitwidth==32:
1138+
assertlen(layout.base_tile_shape)==2
1139+
# We could allow replicated dims in the input, but we'd need to divide the
1140+
# split factor computed below by the replication factor of the input.
1141+
assertnotany(isinstance(d,fa.Replicated)fordinlayout.warp_dims)
1142+
assertnotany(isinstance(d,fa.Replicated)fordinlayout.lane_dims)
1143+
warp_split_factor=math.prod(
1144+
d.timesifisinstance(d,fa.Replicated)else1
1145+
fordinlayout.remove_dimension(1).warp_dims
1146+
)
1147+
lane_split_factor=math.prod(
1148+
d.timesifisinstance(d,fa.Replicated)else1
1149+
fordinlayout.remove_dimension(1).lane_dims
1150+
)
1151+
split_factor=warp_split_factor*lane_split_factor
11331152
registers=_load_32xcols_native(
1134-
self.address,self.shape[1],self.dtype,packing
1153+
self.address,self.shape[1]//split_factor,self.dtype,packing,packing
1154+
).reshape(regs_shape)
1155+
# TODO(apaszke): Support the case where we have a long vector length in the
1156+
# FA more generally, not just for 2x32b.
1157+
# 16-bit types are special, because the store instruction can unpack them.
1158+
eliflayout==TMEM_NATIVE_LAYOUTandhas_default_layoutand (
1159+
(bitwidth==16andpacking==1)
1160+
or (bitwidth==32andlayout.vector_length==2)
1161+
):
1162+
registers=_load_32xcols_native(
1163+
self.address,self.shape[1],self.dtype,packing,TMEM_NATIVE_LAYOUT.vector_length
11351164
).reshape(regs_shape)
11361165
eliflayout==fa.WGMMA_LAYOUTandself.layout==tmem_half_lane_layout(self.shape[1],packing=packing):
11371166
# Load half the columns, since they are folded over lanes.
@@ -1157,8 +1186,6 @@ def load(self, layout: fa.TiledLayout | None = None, is_signed: bool | None = No
11571186
)
11581187

11591188
defstore(self,value:fa.FragmentedArray):
1160-
ifutils.bitwidth(self.dtype)notin {8,16,32}:
1161-
raiseNotImplementedError(f"Unsupported dtype:{self.dtype}")
11621189
ifnotisinstance(value,fa.FragmentedArray):
11631190
raiseTypeError(f"TMEM stores expect a FragmentedArray, got:{value}")
11641191
ifvalue.shape!=self.shape:
@@ -1171,27 +1198,38 @@ def store(self, value: fa.FragmentedArray):
11711198
f"Stored array has dtype{value.mlir_dtype}, but TMEM has dtype"
11721199
f"{self.dtype}"
11731200
)
1201+
ifnotisinstance(value.layout,fa.TiledLayout):
1202+
raiseTypeError(f"Stored array has layout{value.layout}, but TMEM stores expect a TiledLayout")
11741203
packing=self.packing
11751204
has_default_layout=self.layout==tmem_default_layout(packing=packing)
1205+
bitwidth=utils.bitwidth(self.dtype)
11761206
ifvalue.layout==LAYOUTandhas_default_layout:
11771207
_store_32xcols(
11781208
self.address,value.registers.T.reshape((4,-1)),packing
11791209
)
1180-
elif (
1181-
utils.bitwidth(self.dtype)==8
1182-
andvalue.layout==fa.tmem_native_layout(vector_length=packing)
1183-
andhas_default_layout
1210+
elifvalue.layout==self.layout.as_tiled_layout()andpacking*bitwidth==32:
1211+
_store_32xcols_native(self.address,value.registers.reshape(-1),packing)
1212+
# TODO(apaszke): Support the case where we have a long vector length in the
1213+
# FA more generally, not just for 2x32b.
1214+
# TODO(apaszke): Support a wider range of layouts when dealing with unpacking.
1215+
# 16-bit types are special, because the store instruction can unpack them.
1216+
elifvalue.layout==TMEM_NATIVE_LAYOUTandhas_default_layoutand (
1217+
(bitwidth==16andpacking==1)
1218+
or (bitwidth==32andvalue.layout.vector_length==2)
11841219
):
11851220
_store_32xcols_native(self.address,value.registers.reshape(-1),packing)
1186-
elifvalue.layout==TMEM_NATIVE_LAYOUTandhas_default_layout:
1187-
_store_32xcols_native(
1188-
self.address,value.registers.reshape(-1),packing
1189-
)
1190-
elifvalue.layout==fa.WGMMA_LAYOUTandself.layout==tmem_half_lane_layout(self.shape[1],packing=packing):
1221+
elif (
1222+
value.layout==fa.WGMMA_LAYOUT
1223+
andself.layout==tmem_half_lane_layout(self.shape[1],packing=packing)
1224+
):
11911225
registers=value.registers.T.reshape(2,-1)
11921226
registers=np.concatenate(np.split(registers,2,axis=1),axis=0)
11931227
_store_32xcols(self.address,registers,packing)
1194-
elifvalue.layout==fa_m64_collective_layout(self.shape[1])andself.layout==tmem_m64_collective_layout(self.shape[1],packing=packing):
1228+
elifvalue.layout==fa_m64_collective_layout(
1229+
self.shape[1]
1230+
)andself.layout==tmem_m64_collective_layout(
1231+
self.shape[1],packing=packing
1232+
):
11951233
_store_32xcols(self.address,value.registers.reshape(4,-1),packing)
11961234
else:
11971235
raiseValueError(
@@ -1306,37 +1344,49 @@ def _store_32xcols(base_addr, vector_regs, tmem_packing) -> None:
13061344
def_store_32xcols_native(base_addr,vector_regs,tmem_packing)->None:
13071345
i32=ir.IntegerType.get_signless(32)
13081346
assertvector_regs.ndim==1
1309-
cols=len(vector_regs)*TMEM_NATIVE_LAYOUT.vector_length
13101347
vec_ty=ir.VectorType(vector_regs.flat[0].type)
1311-
reg_packing=64//utils.bitwidth(vec_ty)
1312-
store_atom_shape= (32,reg_packing)
1348+
[vector_length]=vec_ty.shape
13131349
elt_bitwidth=utils.bitwidth(vec_ty.element_type)
1350+
reg_packing=32//elt_bitwidth
1351+
store_atom_shape= (32,reg_packing)
1352+
# TODO(apaszke): More general register splitting code, not just 2x32b.
13141353
ifreg_packing==1:
1315-
# Transform data such that each reg is 32 bits wide.
1316-
assertelt_bitwidth==32,elt_bitwidth
1317-
regs= [None]*cols
1318-
c0=arith.constant(i32,0)
1319-
c1=arith.constant(i32,1)
1320-
foridx,vreginenumerate(vector_regs):
1321-
regs[2*idx]=llvm.extractelement(vreg,c0)
1322-
regs[2*idx+1]=llvm.extractelement(vreg,c1)
1354+
ifvector_length==2:
1355+
# Transform data such that each reg is 32 bits wide.
1356+
regs= [None]* (len(vector_regs)*2)
1357+
c0=arith.constant(i32,0)
1358+
c1=arith.constant(i32,1)
1359+
foridx,vreginenumerate(vector_regs):
1360+
regs[2*idx]=llvm.extractelement(vreg,c0)
1361+
regs[2*idx+1]=llvm.extractelement(vreg,c1)
1362+
else:
1363+
regs= [utils.bitcast(r,i32)forrinvector_regs]
13231364
asserttmem_packing==1
13241365
unpack=False
13251366
elifreg_packing==2:
1367+
assertvector_length==2
13261368
# In this case, registers are already packed into 32-bit registers.
1327-
regs=vector_regs
1369+
regs=[utils.bitcast(r,i32)forrinvector_regs]
13281370
ifelt_bitwidth==16:
13291371
assert1<=tmem_packing<=2
13301372
unpack=tmem_packing==1
13311373
else:
1332-
iftmem_packing==1:
1374+
iftmem_packing==1andelt_bitwidth!=32:
13331375
raiseNotImplementedError(
13341376
f"Unsupported packing:{tmem_packing} for element type{elt_bitwidth}"
13351377
)
13361378
asserttmem_packing==32//elt_bitwidth
13371379
unpack=False
13381380
else:
1339-
raiseNotImplementedError(reg_packing)
1381+
iftmem_packing!=reg_packing:
1382+
raiseNotImplementedError(
1383+
f"Only{reg_packing} packing supported for bitwidth{elt_bitwidth},"
1384+
f" but got TMEM packing of{tmem_packing}"
1385+
)
1386+
assertutils.bitwidth(vec_ty)==32
1387+
regs= [utils.bitcast(r,i32)forrinvector_regs]
1388+
unpack=False
1389+
cols=len(regs)*reg_packing
13401390
it=_transfer_32xcols(base_addr,cols,store_atom_shape,tmem_packing,reg_packing)
13411391
foraddr_row_col,instr_num,lane_step,num_sliceinit:
13421392
assertlane_step==0
@@ -1393,21 +1443,23 @@ def _load_32xcols(base_addr, cols, dtype, tmem_packing) -> np.ndarray:
13931443
returnvector_regs
13941444

13951445

1396-
def_load_32xcols_native(base_addr,cols,dtype,tmem_packing)->np.ndarray:
1446+
def_load_32xcols_native(base_addr,cols,dtype,tmem_packing,vector_length)->np.ndarray:
13971447
i32=ir.IntegerType.get_signless(32)
1398-
vec_ty=ir.VectorType.get((2,),dtype)
1448+
vec_ty=ir.VectorType.get((vector_length,),dtype)
13991449
reg_packing=32//utils.bitwidth(dtype)
1450+
assertvector_length%reg_packing==0
14001451
load_shape="32x32b"
1401-
ifreg_packing==1:
1402-
load_atom_shape= (32,1)
1403-
asserttmem_packing==1
1404-
pack=False
1405-
elifreg_packing==2:
1406-
load_atom_shape= (32,2)
1452+
load_atom_shape= (32,reg_packing)
1453+
ifreg_packing==2:
14071454
assert1<=tmem_packing<=2
14081455
pack=tmem_packing==1
14091456
else:
1410-
raiseNotImplementedError(reg_packing)
1457+
iftmem_packing!=reg_packing:
1458+
raiseNotImplementedError(
1459+
f"Only{reg_packing} supported for element type{dtype}, but got"
1460+
f" TMEM packing of{tmem_packing}"
1461+
)
1462+
pack=False
14111463

14121464
it=_transfer_32xcols(base_addr,cols,load_atom_shape,tmem_packing,reg_packing)
14131465
c0=arith.constant(i32,0)
@@ -1416,24 +1468,22 @@ def _load_32xcols_native(base_addr, cols, dtype, tmem_packing) -> np.ndarray:
14161468
foraddr_row_col,instr_num,lane_step,num_sliceinit:
14171469
assertlane_step==0,lane_step
14181470
instr_regs=_tmem_load(addr_row_col,load_shape,instr_num,pack)
1419-
ifreg_packing==1:
1471+
ifreg_packing==1andvector_length==2:
14201472
regs[num_slice]= [llvm.bitcast(dtype,r)forrininstr_regs]
14211473
else:
1422-
assertreg_packing==2
1423-
regs[num_slice]= [llvm.bitcast(vec_ty,r)forrininstr_regs]
1474+
regs[num_slice]= [utils.bitcast(r,vec_ty)forrininstr_regs]
14241475

1425-
ifreg_packing==1:
1476+
ifreg_packing==1andvector_length==2:
14261477
vector_regs=np.ndarray((cols//2,),dtype=object)
14271478
undef=llvm.mlir_undef(vec_ty)
14281479
foridxinrange(vector_regs.size):
14291480
high_undef=llvm.insertelement(undef,regs[2*idx],c0)
14301481
vreg=llvm.insertelement(high_undef,regs[2*idx+1],c1)
14311482
vector_regs[idx]=vreg
14321483
else:
1433-
assertreg_packing==2
1484+
assertvector_length==reg_packing
14341485
vector_regs=np.asarray(regs,dtype=object)
14351486

1436-
assertvector_regs.shape== (cols//TMEM_NATIVE_LAYOUT.vector_length,)
14371487
returnvector_regs
14381488

14391489

‎tests/mosaic/gpu_test.py‎

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1199,18 +1199,33 @@ def setUp(self):
11991199
self.skipTest("Only works on GPU with capability sm_100a or sm_101a")
12001200

12011201
@parameterized.product(
1202-
jax_dtype_packing=[(jnp.float32,1), (jnp.float16,1), (jnp.float16,2)],
1202+
jax_dtype_packing=[(jnp.float32,1), (jnp.float16,1), (jnp.float16,2), (jnp.float8_e5m2,4)],
12031203
reg_tmem_layout_m=[
1204-
(lambda_:tcgen05.LAYOUT,lambda_,p:tcgen05.tmem_default_layout(p),128),
1205-
(lambda_:fa.WGMMA_LAYOUT,tcgen05.tmem_half_lane_layout,64),
1206-
(tcgen05.fa_m64_collective_layout,tcgen05.tmem_m64_collective_layout,64),
1204+
(lambda_c,_p:tcgen05.LAYOUT,lambda_,p:tcgen05.tmem_default_layout(p),128),
1205+
(lambda_c,_p:fa.WGMMA_LAYOUT,tcgen05.tmem_half_lane_layout,64),
1206+
(
1207+
lambdac,_p:tcgen05.fa_m64_collective_layout(c),
1208+
tcgen05.tmem_m64_collective_layout,
1209+
64,
1210+
),
1211+
(
1212+
lambdac,p:tcgen05.tmem_m64_collective_layout(c,p).as_tiled_layout(),
1213+
tcgen05.tmem_m64_collective_layout,
1214+
64,
1215+
),
12071216
],
12081217
)
12091218
deftest_load_store_tmem(self,jax_dtype_packing,reg_tmem_layout_m):
12101219
jax_dtype,packing=jax_dtype_packing
12111220
reg_layout_f,tmem_layout_f,m=reg_tmem_layout_m
12121221
n=160
1213-
reg_layout=reg_layout_f(n)
1222+
reg_layout=reg_layout_f(n,packing)
1223+
iftmem_layout_fistcgen05.tmem_m64_collective_layout:
1224+
ifjax_dtype==jnp.float16andpacking==1:
1225+
self.skipTest("Not implemented yet")
1226+
is_native_transfer=tmem_layout_f(n,packing).as_tiled_layout()==reg_layout
1227+
ifnotis_native_transferandjax_dtype==jnp.float8_e5m2:
1228+
self.skipTest("Not implemented yet")
12141229

12151230
defkernel(ctx,input,output,tmem):
12161231
delctx
@@ -1220,19 +1235,28 @@ def kernel(ctx, input, output, tmem):
12201235

12211236
x=self.prng.uniform(-1,1, (m,n)).astype(jax_dtype)
12221237
y=mgpu.as_gpu_kernel(
1223-
kernel, (1,1,1), (128,1,1),x,x,mgpu.TMEM(x.shape,jax_dtype,layout=tmem_layout_f(n,packing)),
1238+
kernel, (1,1,1), (128,1,1),x,x,
1239+
mgpu.TMEM(x.shape,jax_dtype,layout=tmem_layout_f(n,packing)),
12241240
)(x)
12251241
np.testing.assert_array_equal(x,y)
12261242

1227-
@parameterized.parameters([(jnp.float32,1), (jnp.float16,1), (jnp.float16,2)])
1243+
@parameterized.parameters([
1244+
(jnp.float32,1),
1245+
(jnp.float16,1),
1246+
(jnp.float16,2),
1247+
(jnp.float8_e5m2,4),
1248+
# TODO(apaszke): Enable. LLVM lowering doesn't like 4 bits yet.
1249+
# (jnp.float4_e2m1fn, 8),
1250+
])
12281251
deftest_load_store_tmem_native(self,jax_dtype,packing):
12291252
# TODO(bchetioui): add a test for int8 with a native layout with vector
12301253
# length equal to 4 once TMEM load is implemented for it.
12311254
defkernel(ctx,input,output,tmem):
12321255
delctx
1233-
tmem.store(fa.FragmentedArray.load_untiled(input,layout=tcgen05.TMEM_NATIVE_LAYOUT,optimized=False))
1256+
reg_layout=tcgen05.tmem_default_layout(max(packing,2)).as_tiled_layout()
1257+
tmem.store(fa.FragmentedArray.load_untiled(input,layout=reg_layout,optimized=False))
12341258
tcgen05.commit_tmem()
1235-
tmem.load(tcgen05.TMEM_NATIVE_LAYOUT).store_untiled(output,optimized=False)
1259+
tmem.load(reg_layout).store_untiled(output,optimized=False)
12361260

12371261
x=self.prng.uniform(-1,1, (128,128)).astype(jax_dtype)
12381262
y=mgpu.as_gpu_kernel(

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp