//===- RISCVMatInt.cpp - Immediate materialisation -------------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVMatInt.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "llvm/ADT/APInt.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

static int getInstSeqCost(RISCVMatInt::InstSeq &Res, bool HasRVC) {
  if (!HasRVC)
    return Res.size();

  int Cost = 0;
  for (auto Instr : Res) {
    // Assume instructions that aren't listed aren't compressible.
    bool Compressed = false;
    switch (Instr.getOpcode()) {
    case RISCV::SLLI:
    case RISCV::SRLI:
      Compressed = true;
      break;
    case RISCV::ADDI:
    case RISCV::ADDIW:
    case RISCV::LUI:
      Compressed = isInt<6>(Instr.getImm());
      break;
    }
    // Two RVC instructions take the same space as one RVI instruction, but
    // can take longer to execute than the single RVI instruction. Thus, we
    // consider that two RVC instructions are slightly more costly than one
    // RVI instruction. For longer sequences of RVC instructions the space
    // savings can be worth it, though. The costs below try to model that.
    if (!Compressed)
      Cost += 100; // Baseline cost of one RVI instruction: 100%.
    else
      Cost += 70; // 70% cost of baseline.
  }
  return Cost;
}
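
// Worked example of the cost model above (illustration only): for a
// LUI+ADDIW+SLLI sequence in which only the SLLI fits a compressed encoding,
// the cost is 100 + 100 + 70 = 270, versus 300 if nothing were compressible.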

// Recursively generate a sequence for materializing an integer.
static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI,
                                RISCVMatInt::InstSeq &Res) {
  bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit);

  // Use BSETI for a single bit that can't be expressed by a single LUI or ADDI.
  if (STI.hasFeature(RISCV::FeatureStdExtZbs) && isPowerOf2_64(Val) &&
      (!isInt<32>(Val) || Val == 0x800)) {
    Res.emplace_back(RISCV::BSETI, Log2_64(Val));
    return;
  }

  if (isInt<32>(Val)) {
    // Depending on the active bits in the immediate Value v, the following
    // instruction sequences are emitted:
    //
    // v == 0                        : ADDI
    // v[0,12) != 0 && v[12,32) == 0 : ADDI
    // v[0,12) == 0 && v[12,32) != 0 : LUI
    // v[0,32) != 0                  : LUI+ADDI(W)
    int64_t Hi20 = ((Val + 0x800) >> 12) & 0xFFFFF;
    int64_t Lo12 = SignExtend64<12>(Val);

    if (Hi20)
      Res.emplace_back(RISCV::LUI, Hi20);

    if (Lo12 || Hi20 == 0) {
      unsigned AddiOpc = (IsRV64 && Hi20) ? RISCV::ADDIW : RISCV::ADDI;
      Res.emplace_back(AddiOpc, Lo12);
    }
    return;
  }
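
  // Worked example for the 32-bit path above (illustration only): for
  // Val = 0x12345678, Hi20 = 0x12345 and Lo12 = 0x678, so the emitted
  // sequence is LUI 0x12345 followed by ADDIW 0x678 (ADDI on RV32).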

  assert(IsRV64 && "Can't emit >32-bit imm for non-RV64 target");

  // In the worst case, for a full 64-bit constant, a sequence of 8 instructions
  // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emitted. Note
  // that the first two instructions (LUI+ADDIW) can contribute up to 32 bits
  // while the following ADDI instructions contribute up to 12 bits each.
  //
  // At first glance, implementing this seems to be possible by simply emitting
  // the most significant 32 bits (LUI+ADDIW) followed by as many left shifts
  // (SLLI) and immediate additions (ADDI) as needed. However, because ADDI
  // performs a sign-extended addition, doing it like that would only be
  // possible when at most 11 bits of the ADDI instructions are used. Using all
  // 12 bits of the ADDI instructions, as done by GAS, actually requires that
  // the constant is processed starting with the least significant bits.
  //
  // In the following, constants are processed from LSB to MSB but instruction
  // emission is performed from MSB to LSB by recursively calling
  // generateInstSeq. In each recursion, first the lowest 12 bits are removed
  // from the constant and the optimal shift amount, which can be greater than
  // 12 bits if the constant is sparse, is determined. Then, the shifted
  // remaining constant is processed recursively and gets emitted as soon as it
  // fits into 32 bits. The emission of the shifts and additions is subsequently
  // performed when the recursion returns.

  int64_t Lo12 = SignExtend64<12>(Val);
  Val = (uint64_t)Val - (uint64_t)Lo12;

  int ShiftAmount = 0;
  bool Unsigned = false;
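
  // Worked example of the LSB-to-MSB processing (illustration only): for
  // Val = 0x100000001, Lo12 = 1 is peeled off first, the remainder
  // 0x100000000 is shifted right by 32 and materialized recursively as
  // ADDI 1, and the recursion then appends SLLI 32 and ADDI 1 on the way
  // back out.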

  // Val might now be valid for LUI without needing a shift.
  if (!isInt<32>(Val)) {
    ShiftAmount = llvm::countr_zero((uint64_t)Val);
    Val >>= ShiftAmount;

    // If the remaining bits don't fit in 12 bits, we might be able to reduce
    // the shift amount in order to use LUI which will zero the lower 12
    // bits.
    if (ShiftAmount > 12 && !isInt<12>(Val)) {
      if (isInt<32>((uint64_t)Val << 12)) {
        // Reduce the shift amount and add zeros to the LSBs so it will match
        // LUI.
        ShiftAmount -= 12;
        Val = (uint64_t)Val << 12;
      } else if (isUInt<32>((uint64_t)Val << 12) &&
                 STI.hasFeature(RISCV::FeatureStdExtZba)) {
        // Reduce the shift amount and add zeros to the LSBs so it will match
        // LUI, then shift left with SLLI.UW to clear the upper 32 set bits.
        ShiftAmount -= 12;
        Val = ((uint64_t)Val << 12) | (0xffffffffull << 32);
        Unsigned = true;
      }
    }

    // Try to use SLLI_UW for Val when it is uint32 but not int32.
    if (isUInt<32>(Val) && !isInt<32>(Val) &&
        STI.hasFeature(RISCV::FeatureStdExtZba)) {
      // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with
      // SLLI_UW.
      Val = ((uint64_t)Val) | (0xffffffffull << 32);
      Unsigned = true;
    }
  }
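
  // Worked example of the SLLI_UW fix-up above (illustration only, assumes
  // Zba): Val = 0x100000002000 reduces to 0x80000001 after shifting out 13
  // trailing zeros; the upper 32 bits are then filled with ones so the
  // recursion emits LUI 0x80000 + ADDIW 1, and the final shift below becomes
  // SLLI.UW 13, which clears those upper bits again.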

  generateInstSeqImpl(Val, STI, Res);

  // Skip shift if we were able to use LUI directly.
  if (ShiftAmount) {
    unsigned Opc = Unsigned ? RISCV::SLLI_UW : RISCV::SLLI;
    Res.emplace_back(Opc, ShiftAmount);
  }

  if (Lo12)
    Res.emplace_back(RISCV::ADDI, Lo12);
}

static unsigned extractRotateInfo(int64_t Val) {
  // for case: 0b111..1..xxxxxx1..1..
  unsigned LeadingOnes = llvm::countl_one((uint64_t)Val);
  unsigned TrailingOnes = llvm::countr_one((uint64_t)Val);
  if (TrailingOnes > 0 && TrailingOnes < 64 &&
      (LeadingOnes + TrailingOnes) > (64 - 12))
    return 64 - TrailingOnes;

  // for case: 0bxxx1..1..1...xxx
  unsigned UpperTrailingOnes = llvm::countr_one(Hi_32(Val));
  unsigned LowerLeadingOnes = llvm::countl_one(Lo_32(Val));
  if (UpperTrailingOnes < 32 &&
      (UpperTrailingOnes + LowerLeadingOnes) > (64 - 12))
    return 32 - UpperTrailingOnes;

  return 0;
}
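
// Worked example (illustration only): for Val = 0x0FFFFFFFFFFFFFF0 the upper
// half ends in 28 ones and the lower half starts with 28 ones, so
// extractRotateInfo returns 32 - 28 = 4; rotating Val left by 4 yields the
// simm12 value -256, which the rori/th.srri optimization below exploits.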

static void generateInstSeqLeadingZeros(int64_t Val, const MCSubtargetInfo &STI,
                                        RISCVMatInt::InstSeq &Res) {
  assert(Val > 0 && "Expected positive val");

  unsigned LeadingZeros = llvm::countl_zero((uint64_t)Val);
  uint64_t ShiftedVal = (uint64_t)Val << LeadingZeros;
  // Fill in the bits that will be shifted out with 1s. An example where this
  // helps is trailing one masks with 32 or more ones. This will generate
  // ADDI -1 and an SRLI.
  ShiftedVal |= maskTrailingOnes<uint64_t>(LeadingZeros);

  RISCVMatInt::InstSeq TmpSeq;
  generateInstSeqImpl(ShiftedVal, STI, TmpSeq);

  // Keep the new sequence if it is an improvement or the original is empty.
  if ((TmpSeq.size() + 1) < Res.size() ||
      (Res.empty() && TmpSeq.size() < 8)) {
    TmpSeq.emplace_back(RISCV::SRLI, LeadingZeros);
    Res = TmpSeq;
  }

  // Some cases can benefit from filling the lower bits with zeros instead.
  ShiftedVal &= maskTrailingZeros<uint64_t>(LeadingZeros);
  TmpSeq.clear();
  generateInstSeqImpl(ShiftedVal, STI, TmpSeq);

  // Keep the new sequence if it is an improvement or the original is empty.
  if ((TmpSeq.size() + 1) < Res.size() ||
      (Res.empty() && TmpSeq.size() < 8)) {
    TmpSeq.emplace_back(RISCV::SRLI, LeadingZeros);
    Res = TmpSeq;
  }

  // If we have exactly 32 leading zeros and Zba, we can try using zext.w at
  // the end of the sequence.
  if (LeadingZeros == 32 && STI.hasFeature(RISCV::FeatureStdExtZba)) {
    // Try replacing upper bits with 1.
    uint64_t LeadingOnesVal = Val | maskLeadingOnes<uint64_t>(LeadingZeros);
    TmpSeq.clear();
    generateInstSeqImpl(LeadingOnesVal, STI, TmpSeq);

    // Keep the new sequence if it is an improvement.
    if ((TmpSeq.size() + 1) < Res.size() ||
        (Res.empty() && TmpSeq.size() < 8)) {
      TmpSeq.emplace_back(RISCV::ADD_UW, 0);
      Res = TmpSeq;
    }
  }
}
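
// Worked example (illustration only): for Val = 0xFFFFFFFF, LeadingZeros is 32
// and filling the shifted-out bits with ones turns the shifted value into -1,
// so the constant can be materialized as ADDI -1 followed by SRLI 32.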

namespace llvm::RISCVMatInt {
InstSeq generateInstSeq(int64_t Val, const MCSubtargetInfo &STI) {
  RISCVMatInt::InstSeq Res;
  generateInstSeqImpl(Val, STI, Res);

  // If the low 12 bits are non-zero, the first expansion may end with an ADDI
  // or ADDIW. If there are trailing zeros, try generating a sign extended
  // constant with no trailing zeros and use a final SLLI to restore them.
  if ((Val & 0xfff) != 0 && (Val & 1) == 0 && Res.size() >= 2) {
    unsigned TrailingZeros = llvm::countr_zero((uint64_t)Val);
    int64_t ShiftedVal = Val >> TrailingZeros;
    // If we can use C.LI+C.SLLI instead of LUI+ADDI(W) prefer that since
    // it's more compressible. But only if LUI+ADDI(W) isn't fusable.
    // NOTE: We don't check for the C extension to minimize differences in
    // generated code.
    bool IsShiftedCompressible =
        isInt<6>(ShiftedVal) && !STI.hasFeature(RISCV::TuneLUIADDIFusion);
    RISCVMatInt::InstSeq TmpSeq;
    generateInstSeqImpl(ShiftedVal, STI, TmpSeq);

    // Keep the new sequence if it is an improvement.
    if ((TmpSeq.size() + 1) < Res.size() || IsShiftedCompressible) {
      TmpSeq.emplace_back(RISCV::SLLI, TrailingZeros);
      Res = TmpSeq;
    }
  }
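
  // Worked example (illustration only, assumes the LUI+ADDI fusion tuning
  // feature is off): Val = 0x1400 would normally expand to LUI 1 +
  // ADDI(W) 0x400, but ShiftedVal = 5 fits in 6 bits, so the more
  // compressible ADDI 5 + SLLI 10 pair is kept instead.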

  // If we have a 1 or 2 instruction sequence this is the best we can do. This
  // will always be true for RV32 and will often be true for RV64.
  if (Res.size() <= 2)
    return Res;

  assert(STI.hasFeature(RISCV::Feature64Bit) &&
         "Expected RV32 to only need 2 instructions");

  // If the lower 13 bits are something like 0x17ff, try to add 1 to change the
  // lower 13 bits to 0x1800. We can restore this with an ADDI of -1 at the end
  // of the sequence. Call generateInstSeqImpl on the new constant which may
  // subtract 0xfffffffffffff800 to create another ADDI. This will leave a
  // constant with more than 12 trailing zeros for the next recursive step.
  if ((Val & 0xfff) != 0 && (Val & 0x1800) == 0x1000) {
    int64_t Imm12 = -(0x800 - (Val & 0xfff));
    int64_t AdjustedVal = Val - Imm12;
    RISCVMatInt::InstSeq TmpSeq;
    generateInstSeqImpl(AdjustedVal, STI, TmpSeq);

    // Keep the new sequence if it is an improvement.
    if ((TmpSeq.size() + 1) < Res.size()) {
      TmpSeq.emplace_back(RISCV::ADDI, Imm12);
      Res = TmpSeq;
    }
  }
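
  // Worked example (illustration only): if Val ends in 0x17ff, then Imm12 is
  // -1 and AdjustedVal ends in 0x1800; peeling its Lo12 of -0x800 leaves a
  // value ending in 0x2000 (13 trailing zeros), and the trailing ADDI -1
  // restores the original constant.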

  // If the constant is positive we might be able to generate a shifted constant
  // with no leading zeros and use a final SRLI to restore them.
  if (Val > 0 && Res.size() > 2) {
    generateInstSeqLeadingZeros(Val, STI, Res);
  }

  // If the constant is negative, try inverting and using our trailing zero
  // optimizations. Use an xori to invert the final value.
  if (Val < 0 && Res.size() > 3) {
    uint64_t InvertedVal = ~(uint64_t)Val;
    RISCVMatInt::InstSeq TmpSeq;
    generateInstSeqLeadingZeros(InvertedVal, STI, TmpSeq);

    // Keep it if we found a sequence that is smaller after inverting.
    if (!TmpSeq.empty() && (TmpSeq.size() + 1) < Res.size()) {
      TmpSeq.emplace_back(RISCV::XORI, -1);
      Res = TmpSeq;
    }
  }

  // If the Low and High halves are the same, use pack. The pack instruction
  // packs the XLEN/2-bit lower halves of rs1 and rs2 into rd, with rs1 in the
  // lower half and rs2 in the upper half.
  if (Res.size() > 2 && STI.hasFeature(RISCV::FeatureStdExtZbkb)) {
    int64_t LoVal = SignExtend64<32>(Val);
    int64_t HiVal = SignExtend64<32>(Val >> 32);
    if (LoVal == HiVal) {
      RISCVMatInt::InstSeq TmpSeq;
      generateInstSeqImpl(LoVal, STI, TmpSeq);
      if ((TmpSeq.size() + 1) < Res.size()) {
        TmpSeq.emplace_back(RISCV::PACK, 0);
        Res = TmpSeq;
      }
    }
  }

  // Perform optimization with BSETI in the Zbs extension.
  if (Res.size() > 2 && STI.hasFeature(RISCV::FeatureStdExtZbs)) {
    // Create a simm32 value for LUI+ADDIW by forcing the upper 33 bits to zero.
    // Xor that with original value to get which bits should be set by BSETI.
    uint64_t Lo = Val & 0x7fffffff;
    uint64_t Hi = Val ^ Lo;
    assert(Hi != 0);
    RISCVMatInt::InstSeq TmpSeq;
    if (Lo != 0)
      generateInstSeqImpl(Lo, STI, TmpSeq);

    if (TmpSeq.size() + llvm::popcount(Hi) < Res.size()) {
      do {
        TmpSeq.emplace_back(RISCV::BSETI, llvm::countr_zero(Hi));
        Hi &= (Hi - 1); // Clear lowest set bit.
      } while (Hi != 0);
      Res = TmpSeq;
    }
  }

  // Perform optimization with BCLRI in the Zbs extension.
  if (Res.size() > 2 && STI.hasFeature(RISCV::FeatureStdExtZbs)) {
    // Create a simm32 value for LUI+ADDIW by forcing the upper 33 bits to one.
    // Xor that with original value to get which bits should be cleared by
    // BCLRI.
    uint64_t Lo = Val | 0xffffffff80000000ull;
    uint64_t Hi = Val ^ Lo;
    assert(Hi != 0);

    RISCVMatInt::InstSeq TmpSeq;
    generateInstSeqImpl(Lo, STI, TmpSeq);

    if (TmpSeq.size() + llvm::popcount(Hi) < Res.size()) {
      do {
        TmpSeq.emplace_back(RISCV::BCLRI, llvm::countr_zero(Hi));
        Hi &= (Hi - 1); // Clear lowest set bit.
      } while (Hi != 0);
      Res = TmpSeq;
    }
  }
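
  // Worked example (illustration only, assumes Zbs): a constant with only
  // bits 44 and 0 set can be built as ADDI 1 followed by BSETI 44 instead of
  // an ADDI+SLLI+ADDI chain.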

  // Perform optimization with SH*ADD in the Zba extension.
  if (Res.size() > 2 && STI.hasFeature(RISCV::FeatureStdExtZba)) {
    int64_t Div = 0;
    unsigned Opc = 0;
    RISCVMatInt::InstSeq TmpSeq;
    // Select the opcode and divisor.
    if ((Val % 3) == 0 && isInt<32>(Val / 3)) {
      Div = 3;
      Opc = RISCV::SH1ADD;
    } else if ((Val % 5) == 0 && isInt<32>(Val / 5)) {
      Div = 5;
      Opc = RISCV::SH2ADD;
    } else if ((Val % 9) == 0 && isInt<32>(Val / 9)) {
      Div = 9;
      Opc = RISCV::SH3ADD;
    }
    // Build the new instruction sequence.
    if (Div > 0) {
      generateInstSeqImpl(Val / Div, STI, TmpSeq);
      if ((TmpSeq.size() + 1) < Res.size()) {
        TmpSeq.emplace_back(Opc, 0);
        Res = TmpSeq;
      }
    } else {
      // Try to use LUI+SH*ADD+ADDI.
      int64_t Hi52 = ((uint64_t)Val + 0x800ull) & ~0xfffull;
      int64_t Lo12 = SignExtend64<12>(Val);
      Div = 0;
      if (isInt<32>(Hi52 / 3) && (Hi52 % 3) == 0) {
        Div = 3;
        Opc = RISCV::SH1ADD;
      } else if (isInt<32>(Hi52 / 5) && (Hi52 % 5) == 0) {
        Div = 5;
        Opc = RISCV::SH2ADD;
      } else if (isInt<32>(Hi52 / 9) && (Hi52 % 9) == 0) {
        Div = 9;
        Opc = RISCV::SH3ADD;
      }
      // Build the new instruction sequence.
      if (Div > 0) {
        // A Val with zero Lo12 (which implies Val equals Hi52) should have
        // already been processed into LUI+SH*ADD by the previous optimization.
        assert(Lo12 != 0 &&
               "unexpected instruction sequence for immediate materialisation");
        assert(TmpSeq.empty() && "Expected empty TmpSeq");
        generateInstSeqImpl(Hi52 / Div, STI, TmpSeq);
        if ((TmpSeq.size() + 2) < Res.size()) {
          TmpSeq.emplace_back(Opc, 0);
          TmpSeq.emplace_back(RISCV::ADDI, Lo12);
          Res = TmpSeq;
        }
      }
    }
  }
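
  // Worked example (illustration only, assumes Zba): SH2ADD rd, rs, rs
  // computes (rs << 2) + rs = 5 * rs, so a constant divisible by 5 whose
  // quotient fits in 32 bits needs only the quotient's sequence plus one
  // SH2ADD; the Hi52 path does the same for the rounded upper bits and fixes
  // up the low 12 bits with a final ADDI.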

  // Perform optimization with rori in the Zbb and th.srri in the XTheadBb
  // extension.
  if (Res.size() > 2 && (STI.hasFeature(RISCV::FeatureStdExtZbb) ||
                         STI.hasFeature(RISCV::FeatureVendorXTHeadBb))) {
    if (unsigned Rotate = extractRotateInfo(Val)) {
      RISCVMatInt::InstSeq TmpSeq;
      uint64_t NegImm12 = llvm::rotl<uint64_t>(Val, Rotate);
      assert(isInt<12>(NegImm12));
      TmpSeq.emplace_back(RISCV::ADDI, NegImm12);
      TmpSeq.emplace_back(STI.hasFeature(RISCV::FeatureStdExtZbb)
                              ? RISCV::RORI
                              : RISCV::TH_SRRI,
                          Rotate);
      Res = TmpSeq;
    }
  }
  return Res;
}
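
// Worked example of the rotate optimization (illustration only, assumes Zbb):
// for Val = 0xFFFFFFFFFFFFF00F, extractRotateInfo returns 60, the constant
// rotated left by 60 is -256, and the emitted pair ADDI -256 + RORI 60
// reproduces the original value.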

void generateMCInstSeq(int64_t Val, const MCSubtargetInfo &STI,
                       MCRegister DestReg, SmallVectorImpl<MCInst> &Insts) {
  RISCVMatInt::InstSeq Seq = RISCVMatInt::generateInstSeq(Val, STI);

  MCRegister SrcReg = RISCV::X0;
  for (RISCVMatInt::Inst &Inst : Seq) {
    // Emit an MCInst for this step; the operand list (DestReg, SrcReg and/or
    // the immediate) is chosen from Inst.getOpndKind().
    switch (Inst.getOpndKind()) {
    case RISCVMatInt::Imm:
      Insts.push_back(MCInstBuilder(Inst.getOpcode())
                          .addReg(DestReg)
                          .addImm(Inst.getImm()));
      break;
    case RISCVMatInt::RegX0:
      Insts.push_back(MCInstBuilder(Inst.getOpcode())
                          .addReg(DestReg)
                          .addReg(SrcReg)
                          .addReg(RISCV::X0));
      break;
    case RISCVMatInt::RegReg:
      Insts.push_back(MCInstBuilder(Inst.getOpcode())
                          .addReg(DestReg)
                          .addReg(SrcReg)
                          .addReg(SrcReg));
      break;
    case RISCVMatInt::RegImm:
      Insts.push_back(MCInstBuilder(Inst.getOpcode())
                          .addReg(DestReg)
                          .addReg(SrcReg)
                          .addImm(Inst.getImm()));
      break;
    }

    // Only the first instruction has X0 as its source.
    SrcReg = DestReg;
  }
}

InstSeq generateTwoRegInstSeq(int64_t Val, const MCSubtargetInfo &STI,
                              unsigned &ShiftAmt, unsigned &AddOpc) {
  int64_t LoVal = SignExtend64<32>(Val);
  if (LoVal == 0)
    return RISCVMatInt::InstSeq();

  // Subtract the LoVal to emulate the effect of the final ADD.
  uint64_t Tmp = (uint64_t)Val - (uint64_t)LoVal;
  assert(Tmp != 0);

  // Use trailing zero counts to figure how far we need to shift LoVal to line
  // up with the remaining constant.
  // TODO: This algorithm assumes all non-zero bits in the low 32 bits of the
  // final constant come from LoVal.
  unsigned TzLo = llvm::countr_zero((uint64_t)LoVal);
  unsigned TzHi = llvm::countr_zero(Tmp);
  assert(TzLo < 32 && TzHi >= 32);
  ShiftAmt = TzHi - TzLo;
  AddOpc = RISCV::ADD;

  if (Tmp == ((uint64_t)LoVal << ShiftAmt))
    return RISCVMatInt::generateInstSeq(LoVal, STI);

  // If we have Zba, we can use (ADD_UW X, (SLLI X, 32)).
  if (STI.hasFeature(RISCV::FeatureStdExtZba) && Lo_32(Val) == Hi_32(Val)) {
    ShiftAmt = 32;
    AddOpc = RISCV::ADD_UW;
    return RISCVMatInt::generateInstSeq(LoVal, STI);
  }

  return RISCVMatInt::InstSeq();
}
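
// Worked example (illustration only): for Val = 0x0000123400001234, LoVal is
// 0x1234 and lines up with the upper half after a 32-bit shift, so the
// function returns the sequence for 0x1234 with ShiftAmt = 32 and
// AddOpc = ADD; the caller then computes X + (X << 32).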

int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI,
                  bool CompressionCost, bool FreeZeroes) {
  bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit);
  bool HasRVC = CompressionCost && (STI.hasFeature(RISCV::FeatureStdExtC) ||
                                    STI.hasFeature(RISCV::FeatureStdExtZca));
  int PlatRegSize = IsRV64 ? 64 : 32;

  // Split the constant into platform register sized chunks, and calculate cost
  // of each chunk.
  int Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < Size; ShiftVal += PlatRegSize) {
    APInt Chunk = Val.ashr(ShiftVal).sextOrTrunc(PlatRegSize);
    if (FreeZeroes && Chunk.getSExtValue() == 0)
      continue;
    InstSeq MatSeq = generateInstSeq(Chunk.getSExtValue(), STI);
    Cost += getInstSeqCost(MatSeq, HasRVC);
  }
  return std::max(FreeZeroes ? 0 : 1, Cost);
}

} // namespace llvm::RISCVMatInt