Commit e2d141d

pytorchbot and ngimel authored

set thread_work_size to 4 for unrolled kernel (#154541)

set thread_work_size to 4 for unrolled kernel (#152396)

Previous PRs enabling 8-vectorization inadvertently regressed unrolled kernel perf.

Pull Request resolved: #152396
Approved by: https://github.com/BoyuanFeng, https://github.com/msaroufim, https://github.com/malfet, https://github.com/Aidyn-A, https://github.com/atalman
(cherry picked from commit adebb8b)
Co-authored-by: Natalia Gimelshein <ngimel@meta.com>

1 parent 1214198 · commit e2d141d

File tree

1 file changed: +11 −2 lines changed


aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 11 additions & 2 deletions

@@ -83,6 +83,14 @@ constexpr auto elems_per_thread(){
 }
 #endif
 
+
+// thread work size of 8 regresses the perf of elementwise kernel on cuda
+// this doesn't change ROCm behavior as thread_work_size is already 4 on ROCm
+constexpr int elementwise_thread_work_size() { return 4; }
+constexpr int elementwise_block_work_size() {
+  return elementwise_thread_work_size() * num_threads();
+}
+
 template <int io_sizes>
 constexpr auto io_block_work_size() {
   return num_threads() * elems_per_thread<io_sizes>();
@@ -336,9 +344,10 @@ static inline void launch_unrolled_kernel(
     loader_t l,
     storer_t s) {
   TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
-  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  int64_t grid =
+      (N + elementwise_block_work_size() - 1) / elementwise_block_work_size();
   auto stream = at::cuda::getCurrentCUDAStream();
-  unrolled_elementwise_kernel<func_t, array_t, thread_work_size()>
+  unrolled_elementwise_kernel<func_t, array_t, elementwise_thread_work_size()>
       <<<grid, num_threads(), 0, stream>>>(N, f, data, ic, oc, l, s);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
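For context, here is a minimal standalone sketch of the launch arithmetic this change affects. It is an illustration of the ceil division in launch_unrolled_kernel, not the actual PyTorch implementation; the 128-thread block size is an assumption for the example (the real value comes from num_threads() in the headers), while the 4-elements-per-thread figure is the value this commit sets.

    #include <cstdint>
    #include <cstdio>

    // Stand-ins for the helpers in CUDALoops.cuh. The 128-thread block size
    // is assumed here for illustration only.
    constexpr int num_threads() { return 128; }
    constexpr int elementwise_thread_work_size() { return 4; }  // value set by this commit
    constexpr int elementwise_block_work_size() {
      return elementwise_thread_work_size() * num_threads();    // 4 * 128 = 512
    }

    int main() {
      // The same ceil division the patched launch_unrolled_kernel performs:
      // with 4 elements per thread, each block covers 512 elements, so a
      // given N launches twice as many blocks as an 8-per-thread setup.
      int64_t N = 1000000;
      int64_t grid =
          (N + elementwise_block_work_size() - 1) / elementwise_block_work_size();
      std::printf("N=%lld -> %lld blocks of %d threads\n",
                  (long long)N, (long long)grid, num_threads());
      return 0;
    }

Under these assumptions, N = 1,000,000 yields 1954 blocks, where a work size of 8 (1024 elements per block) would have yielded 977.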

