@@ -96,7 +96,7 @@ def temp_float32_matmul_precision(precision: str):
9696
def skip_on_cpu(test_func):
    """Decorator to skip tests that are not supported on CPU.

    Args:
        test_func: The test function/method to wrap.

    Returns:
        The test wrapped with an unconditional CPU skip.
    """
    # skipCPUIf(True, ...) always skips when running on CPU; the reason
    # string previously said "CUDA" by mistake — it must name CPU, since
    # that is the device being skipped.
    decorated_func = skipCPUIf(True, "Not supported on CPU")(test_func)
    return decorated_func
101101
102102
@@ -2851,6 +2851,7 @@ def test_strided_backwards(self):
28512851 (1 ,0 ,2 ,3 ),# Reverse order
28522852 (0 ,2 ,1 ,3 ),# Mixed order
28532853 (2 ,0 ,1 ,3 ),# Another mixed order
2854+ (0 ,1 ,3 ,2 ),# Non contiguous last dim
28542855 ],
28552856 )
28562857@common_utils .parametrize ("shape" , [(2 ,1 ,128 ,16 ), (4 ,2 ,64 ,16 )])
@@ -2899,12 +2900,7 @@ def test_flex_attention_stride_ordering(self, device, mode, permute_order, shape
28992900@common_utils .parametrize ("mode" , ["eager" ,"inductor" ])
29002901@common_utils .parametrize (
29012902"permute_order" ,
2902- [
2903- (0 ,1 ,2 ,3 ),
2904- (1 ,0 ,2 ,3 ),
2905- (0 ,2 ,1 ,3 ),
2906- (2 ,0 ,1 ,3 ),
2907- ],
2903+ [(0 ,1 ,2 ,3 ), (1 ,0 ,2 ,3 ), (0 ,2 ,1 ,3 ), (2 ,0 ,1 ,3 ), (0 ,1 ,3 ,2 )],
29082904 )
29092905@common_utils .parametrize ("shape" , [(2 ,5 ,128 ,16 ), (4 ,2 ,64 ,16 )])
29102906def test_flex_attention_backward_stride_ordering (
@@ -2948,6 +2944,69 @@ def test_flex_attention_backward_stride_ordering(
29482944f"Mode:{ mode } , Stride order mismatch for{ name } : grad{ input_stride_order } , input{ orig_stride_order } ." ,
29492945 )
29502946
    @supported_platform
    def test_non_contiguous_last_dim(self, device):
        """Test flex_attention with tensors having non contiguous last dimension.

        Builds q/k/v that are column-major in their last two dims (so
        stride(-1) != 1), then checks that compiled flex_attention matches
        an eager reference and a float64 "golden" run — including gradients
        on devices that support backward.
        """
        B, H, D = 4, 8, 64
        # float16 on CUDA exercises the low-precision kernel path; CPU
        # reference path stays in float32.
        dtype = torch.float16 if device == "cuda" else torch.float32
        for S in [16, 64]:

            def column_major_tensor():
                # Allocate contiguously, then transpose/contiguous/transpose
                # so the result has the same logical shape (B, H, S, D) but
                # is column major in the last 2 dims (last-dim stride != 1).
                tensor = torch.randn(
                    (B, H, S, D),
                    dtype=dtype,
                    device=device,
                )
                # Column major in last 2 dims
                return tensor.transpose(-1, -2).contiguous().transpose(-1, -2)

            q = column_major_tensor()
            k = column_major_tensor()
            v = column_major_tensor()

            # Only request gradients where backward is supported for this
            # device (DEVICE_SUPPORTS_BACKWARDS is defined elsewhere in
            # this file).
            requires_grad = device in DEVICE_SUPPORTS_BACKWARDS
            if requires_grad:
                q.requires_grad_(True)
                k.requires_grad_(True)
                v.requires_grad_(True)

            # Sanity-check the setup: the last dim really is non-contiguous.
            self.assertNotEqual(q.stride()[-1], 1)
            self.assertNotEqual(k.stride()[-1], 1)
            self.assertNotEqual(v.stride()[-1], 1)

            # Independent clones: _ref for the eager baseline, _gold in
            # float64 for a high-precision reference.
            q_ref, k_ref, v_ref = query_key_value_clones(q, k, v)
            q_gold, k_gold, v_gold = query_key_value_clones(q, k, v, torch.float64)

            golden_out = flex_attention(q_gold, k_gold, v_gold)
            ref_out = flex_attention(q_ref, k_ref, v_ref)

            # dynamic=True so both S values in this loop compile to one
            # dynamic-shape graph rather than recompiling per size.
            flex_compiled = torch.compile(flex_attention, fullgraph=True, dynamic=True)
            compiled_out = flex_compiled(q, k, v)

            self._check_out(golden_out, ref_out, compiled_out)

            if requires_grad:
                backward_grad = torch.randn_like(ref_out)

                # Golden run needs the grad in its own (float64) dtype.
                golden_out.backward(backward_grad.to(torch.float64))
                ref_out.backward(backward_grad)
                compiled_out.backward(backward_grad)

                # Compare outputs and all q/k/v gradients across golden,
                # eager-reference, and compiled runs.
                self._check_out_and_grad(
                    golden_out,
                    ref_out,
                    compiled_out,
                    q_gold,
                    q_ref,
                    q,
                    k_gold,
                    k_ref,
                    k,
                    v_gold,
                    v_ref,
                    v,
                )
29513010@supported_platform
29523011@common_utils .parametrize ("compile" , [True ,False ])
29533012def test_fully_masked_out_rows_0_check (self ,device ,compile :bool ):