@@ -3231,6 +3231,18 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
32313231assert (size <= INT32_MAX);
32323232assert (dstOffset < (INT32_MAX -static_cast <int >(size)));
32333233
3234+ auto emitStore = [&](instruction ins,unsigned width, regNumber target) {
3235+ if (dstLclNum != BAD_VAR_NUM)
3236+ {
3237+ emit->emitIns_S_R (ins,EA_ATTR (width), target, dstLclNum, dstOffset);
3238+ }
3239+ else
3240+ {
3241+ emit->emitIns_ARX_R (ins,EA_ATTR (width), target, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale,
3242+ dstOffset);
3243+ }
3244+ };
3245+
32343246#ifdef FEATURE_SIMD
32353247if (willUseSimdMov)
32363248 {
@@ -3244,18 +3256,6 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
32443256 instruction simdMov =simdUnalignedMovIns ();
32453257unsigned bytesWritten =0 ;
32463258
3247- auto emitSimdMovs = [&]() {
3248- if (dstLclNum != BAD_VAR_NUM)
3249- {
3250- emit->emitIns_S_R (simdMov,EA_ATTR (regSize), srcXmmReg, dstLclNum, dstOffset);
3251- }
3252- else
3253- {
3254- emit->emitIns_ARX_R (simdMov,EA_ATTR (regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
3255- dstAddrIndexScale, dstOffset);
3256- }
3257- };
3258-
32593259while (bytesWritten < size)
32603260 {
32613261if (bytesWritten + regSize > size)
@@ -3264,7 +3264,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
32643264break ;
32653265 }
32663266
3267- emitSimdMovs ( );
3267+ emitStore (simdMov, regSize, srcXmmReg );
32683268 dstOffset += regSize;
32693269 bytesWritten += regSize;
32703270 }
@@ -3279,10 +3279,71 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
32793279
32803280// Rewind dstOffset so we can fit a vector for the whole remainder
32813281 dstOffset -= (regSize - size);
3282- emitSimdMovs ( );
3282+ emitStore (simdMov, regSize, srcXmmReg );
32833283 size =0 ;
32843284 }
32853285 }
3286+ else if (node->IsOnHeapAndContainsReferences () && ((internalRegisters.GetAll (node) & RBM_ALLFLOAT) !=0 ))
3287+ {
3288+ // For blocks with GC refs we can still use SIMD, but only for contiguous
3289+ // non-GC parts where atomicity guarantees are not that strict.
3290+ assert (!willUseSimdMov);
3291+ ClassLayout* layout = node->GetLayout ();
3292+
3293+ regNumber simdZeroReg = REG_NA;
3294+ unsigned slots = layout->GetSlotCount ();
3295+ unsigned slot =0 ;
3296+ while (slot < slots)
3297+ {
3298+ if (!layout->IsGCPtr (slot))
3299+ {
3300+ // How many contiguous non-GC slots do we have?
3301+ unsigned nonGcSlotCount =0 ;
3302+ do
3303+ {
3304+ nonGcSlotCount++;
3305+ slot++;
3306+ }while ((slot < slots) && !layout->IsGCPtr (slot));
3307+
3308+ for (unsigned nonGcSlot =0 ; nonGcSlot < nonGcSlotCount; nonGcSlot++)
3309+ {
3310+ // Are the remaining contiguous non-GC slots enough to use SIMD?
3311+ unsigned simdSize = compiler->roundDownSIMDSize ((nonGcSlotCount - nonGcSlot) * REGSIZE_BYTES);
3312+ if (simdSize >0 )
3313+ {
3314+ // Initialize simdZeroReg with zero on demand
3315+ if (simdZeroReg == REG_NA)
3316+ {
3317+ simdZeroReg = internalRegisters.GetSingle (node, RBM_ALLFLOAT);
3318+ // SIMD16 is sufficient for any SIMD size
3319+ simd_t vecCon = {};
3320+ genSetRegToConst (simdZeroReg, TYP_SIMD16, &vecCon);
3321+ }
3322+
3323+ emitStore (simdUnalignedMovIns (), simdSize, simdZeroReg);
3324+ dstOffset += (int )simdSize;
3325+ nonGcSlot += (simdSize / REGSIZE_BYTES) -1 ;
3326+ }
3327+ else
3328+ {
3329+ emitStore (INS_mov, REGSIZE_BYTES, srcIntReg);
3330+ dstOffset += REGSIZE_BYTES;
3331+ }
3332+ }
3333+ }
3334+ else
3335+ {
3336+ // GC slot - update atomically
3337+ emitStore (INS_mov, REGSIZE_BYTES, srcIntReg);
3338+ dstOffset += REGSIZE_BYTES;
3339+ slot++;
3340+ }
3341+ }
3342+
3343+ // There are no trailing elements
3344+ assert ((layout->GetSize () % TARGET_POINTER_SIZE) ==0 );
3345+ size =0 ;
3346+ }
32863347#endif // FEATURE_SIMD
32873348
32883349assert ((srcIntReg != REG_NA) || (size ==0 ));
@@ -3298,15 +3359,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
32983359
32993360for (; size > regSize; size -= regSize, dstOffset += regSize)
33003361 {
3301- if (dstLclNum != BAD_VAR_NUM)
3302- {
3303- emit->emitIns_S_R (INS_mov,EA_ATTR (regSize), srcIntReg, dstLclNum, dstOffset);
3304- }
3305- else
3306- {
3307- emit->emitIns_ARX_R (INS_mov,EA_ATTR (regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
3308- dstAddrIndexScale, dstOffset);
3309- }
3362+ emitStore (INS_mov, regSize, srcIntReg);
33103363 }
33113364
33123365// Handle the non-SIMD remainder by overlapping with previously processed data if needed
@@ -3322,15 +3375,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
33223375assert (shiftBack <= regSize);
33233376 dstOffset -= shiftBack;
33243377
3325- if (dstLclNum != BAD_VAR_NUM)
3326- {
3327- emit->emitIns_S_R (INS_mov,EA_ATTR (regSize), srcIntReg, dstLclNum, dstOffset);
3328- }
3329- else
3330- {
3331- emit->emitIns_ARX_R (INS_mov,EA_ATTR (regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
3332- dstAddrIndexScale, dstOffset);
3333- }
3378+ emitStore (INS_mov, regSize, srcIntReg);
33343379 }
33353380#else // TARGET_X86
33363381for (unsigned regSize = REGSIZE_BYTES; size >0 ; size -= regSize, dstOffset += regSize)
@@ -3339,15 +3384,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
33393384 {
33403385 regSize /=2 ;
33413386 }
3342- if (dstLclNum != BAD_VAR_NUM)
3343- {
3344- emit->emitIns_S_R (INS_mov,EA_ATTR (regSize), srcIntReg, dstLclNum, dstOffset);
3345- }
3346- else
3347- {
3348- emit->emitIns_ARX_R (INS_mov,EA_ATTR (regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
3349- dstAddrIndexScale, dstOffset);
3350- }
3387+
3388+ emitStore (INS_mov, regSize, srcIntReg);
33513389 }
33523390#endif
33533391}