@@ -10894,9 +10894,12 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
1089410894assert ((blkSize + alignmentHiBlkSize) == (untrLclHi - untrLclLo));
1089510895#endif // !defined(TARGET_AMD64)
1089610896
10897+ const int maxSimdSize = (int )compiler->roundDownSIMDSize (blkSize);
10898+ assert ((maxSimdSize >= XMM_REGSIZE_BYTES) && (maxSimdSize <= ZMM_REGSIZE_BYTES));
10899+
1089710900// The loop is unrolled 3 times so we do not move to the loop block until it
1089810901// will loop at least once so the threshold is 6.
10899- if (blkSize < (6 *XMM_REGSIZE_BYTES ))
10902+ if (blkSize < (6 *maxSimdSize ))
1090010903 {
1090110904// Generate the following code:
1090210905//
@@ -10905,10 +10908,22 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
1090510908// ...
1090610909// movups xmmword ptr [ebp/esp-OFFS], xmm4
1090710910// mov qword ptr [ebp/esp-OFFS], rax
10908-
10911+ //
10912+ // NOTE: it implicitly zeroes YMM4 and ZMM4 as well.
1090910913 emit->emitIns_SIMD_R_R_R (INS_xorps, EA_16BYTE, zeroSIMDReg, zeroSIMDReg, zeroSIMDReg);
1091010914
1091110915int i =0 ;
10916+ if (maxSimdSize > XMM_REGSIZE_BYTES)
10917+ {
10918+ for (; i <= blkSize - maxSimdSize; i += maxSimdSize)
10919+ {
10920+ // We previously aligned data to 16 bytes which might not be aligned to maxSimdSize
10921+ emit->emitIns_AR_R (simdUnalignedMovIns (),EA_ATTR (maxSimdSize), zeroSIMDReg, frameReg,
10922+ alignedLclLo + i);
10923+ }
10924+ // Remainder will be handled by the xmm loop below
10925+ }
10926+
1091210927for (; i < blkSize; i += XMM_REGSIZE_BYTES)
1091310928 {
1091410929 emit->emitIns_AR_R (simdMov,EA_ATTR (XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, alignedLclLo + i);