@@ -7802,6 +7802,285 @@ void Lowering::ContainCheckBitCast(GenTree* node)
    }
}

struct StoreCoalescingData
{
    var_types targetType;
    GenTree*  baseAddr;
    GenTree*  index;
    GenTree*  value;
    uint32_t  scale;
    int       offset;
};

//------------------------------------------------------------------------
// GetStoreCoalescingData: given a STOREIND node, get the data needed to perform
//    store coalescing including pointer to the previous node.
//
// Arguments:
//    comp - the compiler instance
//    ind  - the STOREIND node
//    data - [OUT] the data needed for store coalescing
//
// Return Value:
//    true if the data was successfully retrieved, false otherwise.
//    Basically, false means that we definitely can't do store coalescing.
//
static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCoalescingData* data)
{
    // Don't merge volatile stores.
    if (ind->IsVolatile())
    {
        return false;
    }

    // Data has to be INT_CNS; it can also be VEC_CNS in future.
    if (!ind->Data()->IsCnsIntOrI())
    {
        return false;
    }

    data->targetType = ind->TypeGet();
    data->value      = ind->Data();
    if (ind->Addr()->OperIs(GT_LEA))
    {
        GenTree* base  = ind->Addr()->AsAddrMode()->Base();
        GenTree* index = ind->Addr()->AsAddrMode()->Index();
        if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum()))
        {
            // Base must be a non-address-exposed local. It's possible for it to be nullptr
            // when index is not null, but let's ignore such cases.
            return false;
        }
        if ((index != nullptr) &&
            (!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum())))
        {
            // Index should be either nullptr or a non-address-exposed local.
            return false;
        }
        data->baseAddr = base;
        data->index    = index;
        data->scale    = ind->Addr()->AsAddrMode()->GetScale();
        data->offset   = ind->Addr()->AsAddrMode()->Offset();
    }
    else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum()))
    {
        // Address is just a local; no offset, scale is 1.
        data->baseAddr = ind->Addr();
        data->index    = nullptr;
        data->scale    = 1;
        data->offset   = 0;
    }
    else
    {
        // Address is neither a LEA nor a local.
        return false;
    }
    return true;
}

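// For illustration, the two address shapes the helper above accepts and the data
// it extracts from them (V00 and V01 are hypothetical non-address-exposed locals,
// not taken from a real dump):
//
//   STOREIND(LEA(V00 + V01 * 4 + 8), CNS_INT)  ->  baseAddr = V00, index = V01, scale = 4, offset = 8
//   STOREIND(V00, CNS_INT)                     ->  baseAddr = V00, index = null, scale = 1, offset = 0
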
//------------------------------------------------------------------------
// LowerStoreIndirCoalescing: If the given STOREIND node is followed by a similar
//    STOREIND node, try to merge them into a single store of a twice wider type. Example:
//
//      *  STOREIND  int
//      +--*  LCL_VAR  byref  V00
//      \--*  CNS_INT  int    0x1
//
//      *  STOREIND  int
//      +--*  LEA(b+4)  byref
//      |  \--*  LCL_VAR  byref  V00
//      \--*  CNS_INT  int    0x2
//
//    We can merge these two into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value:
//
//      *  STOREIND  long
//      +--*  LEA(b+0)  byref
//      |  \--*  LCL_VAR  byref  V00
//      \--*  CNS_INT  long   0x200000001
//
// Arguments:
//    ind - the current STOREIND node
//
void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
{
    // LA, RISC-V and ARM32 are more likely to receive a terrible performance hit from
    // unaligned accesses, making this optimization questionable.
#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
    if (!comp->opts.OptimizationEnabled())
    {
        return;
    }

    // For now, we require the current STOREIND to have LEA (the previous store may not have it)
    // so we can easily adjust the offset; consider making it more flexible in future.
    if (!ind->Addr()->OperIs(GT_LEA))
    {
        return;
    }

    // We're going to do it in a loop while we see suitable STOREINDs to coalesce.
    // E.g. we have the following LIR sequence:
    //
    //   ...addr nodes...
    //   STOREIND(int)
    //   ...addr nodes...
    //   STOREIND(short)
    //   ...addr nodes...
    //   STOREIND(short) <-- we're here
    //
    // First we merge the two 'short' stores, then we merge the result with the 'int' store
    // to get a single store of 8 bytes.
    do
    {
        // This check is not really needed; it's just for better throughput.
        if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
        {
            return;
        }

        StoreCoalescingData currData;
        StoreCoalescingData prevData;

        // Get coalescing data for the current STOREIND.
        if (!GetStoreCoalescingData(comp, ind, &currData))
        {
            return;
        }

        bool isClosedRange = false;
        // Now we need to find the very first LIR node representing the current STOREIND
        // and make sure that there are no other unexpected nodes in-between.
        LIR::ReadOnlyRange currIndRange = BlockRange().GetTreeRange(ind, &isClosedRange);
        if (!isClosedRange)
        {
            return;
        }

        GenTree* prevTree = currIndRange.FirstNode()->gtPrev;

        // Now we need to find the previous STOREIND;
        // we can ignore any NOPs or IL_OFFSETs in-between.
        while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET))
        {
            prevTree = prevTree->gtPrev;
        }

        // It's not a STOREIND - bail out.
        if ((prevTree == nullptr) || !prevTree->OperIs(GT_STOREIND))
        {
            return;
        }

        // Get coalescing data for the previous STOREIND.
        GenTreeStoreInd* prevInd = prevTree->AsStoreInd();
        if (!GetStoreCoalescingData(comp, prevInd, &prevData))
        {
            return;
        }

        // Same for the previous STOREIND: make sure there are no unexpected nodes around.
        LIR::ReadOnlyRange prevIndRange = BlockRange().GetTreeRange(prevInd, &isClosedRange);
        if (!isClosedRange)
        {
            return;
        }

        // STOREINDs aren't value nodes.
        LIR::Use use;
        assert(!BlockRange().TryGetUse(prevInd, &use) && !BlockRange().TryGetUse(ind, &use));

        // BaseAddr, Index, Scale and Type all have to match.
        if ((prevData.scale != currData.scale) || (prevData.targetType != currData.targetType) ||
            !GenTree::Compare(prevData.baseAddr, currData.baseAddr) ||
            !GenTree::Compare(prevData.index, currData.index))
        {
            return;
        }

        // The offsets must differ by exactly the size of the type; we don't support
        // equal or overlapping offsets.
        if (abs(prevData.offset - currData.offset) != (int)genTypeSize(prevData.targetType))
        {
            return;
        }

        // Since we're merging two stores of the same type, the new type is twice wider.
        var_types oldType = ind->TypeGet();
        var_types newType;
        switch (oldType)
        {
            case TYP_BYTE:
            case TYP_UBYTE:
                newType = TYP_USHORT;
                break;

            case TYP_SHORT:
            case TYP_USHORT:
                newType = TYP_INT; // TYP_UINT is not legal in IR
                break;

#ifdef TARGET_64BIT
            case TYP_INT:
                newType = TYP_LONG;
                break;
#endif // TARGET_64BIT

            // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to
            // be converted to TYP_INT/TYP_LONG for a constant value.
            //
            // TODO-CQ:
            //   2 x LONG/REF -> SIMD16
            //   2 x SIMD16   -> SIMD32
            //   2 x SIMD32   -> SIMD64
            //
            // where it's legal (e.g. SIMD is not atomic on x64)
            //
            default:
                return;
        }

        // Delete the previous STOREIND entirely.
        BlockRange().Remove(std::move(prevIndRange));

        // We know it's always LEA for now.
        GenTreeAddrMode* addr = ind->Addr()->AsAddrMode();

        // Update the offset to be the minimum of the two.
        addr->SetOffset(min(prevData.offset, currData.offset));

        // Update the type for both the STOREIND and its value.
        ind->gtType         = newType;
        ind->Data()->gtType = newType;

        // We currently only support these constants for the value.
        assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI());
        size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue();
        size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue();

        // If the previous store was at a higher address, swap the constants.
        if (prevData.offset > currData.offset)
        {
            std::swap(lowerCns, upperCns);
        }

        // Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT
        // the mask will be 0xFFFF, for TYP_INT it's 0xFFFFFFFF.
        size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE;
        lowerCns &= mask;
        upperCns &= mask;

        size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE)));

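        // A worked example of the merge above, using the values from the function
        // header: two TYP_INT stores of 0x1 (offset 0) and 0x2 (offset 4) on a
        // 64-bit target. genTypeSize(oldType) is 4, so mask = ~size_t(0) >> 32 =
        // 0xFFFFFFFF, and val = 0x1 | (0x2 << 32) = 0x200000001.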
        JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val);

        // It's not expected to be contained yet, but just in case...
        ind->Data()->ClearContained();
        ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val;
        ind->gtFlags |= GTF_IND_UNALIGNED;

    } while (true);
#endif // TARGET_XARCH || TARGET_ARM64
}

//------------------------------------------------------------------------
// LowerStoreIndirCommon: a common logic to lower StoreIndir.
//
@@ -7842,6 +8121,7 @@ void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
        }
#endif

        LowerStoreIndirCoalescing(ind);
        LowerStoreIndir(ind);
    }
}