The full code (works on x86_64) isavailable on GitHub
At first I used just original floating-point version, but it turned out to be too slow, because used DSP (32 bit, 120 MHz max, 112kB program and 448kB data RAM) does not have FPU.Rewriting the code to be fixed-point made the program run faster by a factor of about 4-5, which was still not enough (but very close). I calculated the necessary speedup to be 15% (but more is better, as always), using the following facts:
- Initial floating-point code produced ~14s audio, while on PC it is about 2.6s (music)
- Fixed-point code produces 75.4s audio vs 67s on PC (sine wave, example is on the picture)

- Not doing any decoding at all and just throwing out some predefined data does not produce same defect.
This defect is represented as a block of zeroes between the blocks of valid data.
So I assumed that there is a timer responsible for outputting the audio data, and if not enough data from decoder is available then silence is produced. So the differences in lengths of output file from DSP (excluding silence at the start and at the end of the file which is completely expected behavior) and of the file decoded on the PC should correlate with necessary speed up.
Next I benchmarked the code (using clock() before and after each function call):
| Function | Average time | Number of calls | Percentage |
|---|---|---|---|
| decodeBand | 0,49 | 1 | 3,24% |
| decodeGradient | 0,45 | 1 | 2,97% |
| calculateGradient | 0,45 | 1 | 2,96% |
| decodeScaleFactors | 0,67 | 2 | 8,91% |
| calculatePrecisionMask | 0,45 | 2 | 6,03% |
| calculatePrecisions | 0,49 | 2 | 6,56% |
| decodeSpectrum | 1,14 | 2 | 15,13% |
| decodeSpectrumFine | 0,48 | 2 | 6,36% |
| dequantizeSpectra | 0,78 | 2 | 10,40% |
| scaleSpectrum | 0,59 | 2 | 7,84% |
| RunImdct | 2,23 | 2 | 29,60% |
| Total | 15,07 | - | - |
Main candidates for optimization areRunImdct,decodeSpectrum,dequantizeSpectra. Here's the code for this functions (a little bit cleaned from variables used for fixed-point translation):
#define WINDOW_SHIFT 29void RunImdct(Mdct* mdct, scalar* input, scalar* output){ const int size = 1 << mdct->Bits; const int half = size / 2; scalar dctOut[MAX_FRAME_SAMPLES] = { 0 }; const scalar* window = ImdctWindow[mdct->Bits - 7]; scalar* previous = mdct->ImdctPrevious; Dct4(mdct, input, dctOut); for (int i = 0; i < half; i++) { output[i] = mul_rsftrnd_ldac(window[i], dctOut[i + half], WINDOW_SHIFT) + previous[i]; output[i + half] = mul_rsftrnd_ldac(window[i + half], -dctOut[size - 1 - i], WINDOW_SHIFT) - previous[i + half]; previous[i] = mul_rsftrnd_ldac(window[size - 1 - i], -dctOut[half - i - 1], WINDOW_SHIFT); previous[i + half] = mul_rsftrnd_ldac(window[half - i - 1], dctOut[i], WINDOW_SHIFT); }}static void Dct4(Mdct* mdct, scalar* input, scalar* output){ int MdctBits = mdct->Bits; int MdctSize = 1 << MdctBits; const int* shuffleTable = ShuffleTables[MdctBits]; const scalar* sinTable = SinTables[MdctBits]; const scalar* cosTable = CosTables[MdctBits]; scalar dctTemp[MAX_FRAME_SAMPLES]; int size = MdctSize; int lastIndex = size - 1; int halfSize = size / 2; for (int i = 0; i < halfSize; i++) { int i2 = i * 2; scalar a = input[i2]; scalar b = input[lastIndex - i2]; scalar sin = sinTable[i]; scalar cos = cosTable[i]; dctTemp[i2] = mul_rsftrnd_ldac(a, cos, 31) + mul_rsftrnd_ldac(b, sin, 31); dctTemp[i2 + 1] = mul_rsftrnd_ldac(a, sin, 31) - mul_rsftrnd_ldac(b, cos, 31); } int stageCount = MdctBits - 1; for (int stage = 0; stage < stageCount; stage++) { int blockCount = 1 << stage; int blockSizeBits = stageCount - stage; int blockHalfSizeBits = blockSizeBits - 1; int blockSize = 1 << blockSizeBits; int blockHalfSize = 1 << blockHalfSizeBits; sinTable = SinTables[blockHalfSizeBits]; cosTable = CosTables[blockHalfSizeBits]; for (int block = 0; block < blockCount; block++) { for (int i = 0; i < blockHalfSize; i++) { int frontPos = (block * blockSize + i) * 2; int backPos = frontPos + blockSize; scalar a = dctTemp[frontPos] - dctTemp[backPos]; scalar b = dctTemp[frontPos + 1] - dctTemp[backPos + 1]; scalar sin = sinTable[i]; scalar cos = cosTable[i]; dctTemp[frontPos] += dctTemp[backPos]; dctTemp[frontPos + 1] += dctTemp[backPos + 1]; dctTemp[backPos] = mul_rsftrnd_ldac(a, cos, 31) + mul_rsftrnd_ldac(b, sin, 31); dctTemp[backPos + 1] = mul_rsftrnd_ldac(a, sin, 31) - mul_rsftrnd_ldac(b, cos, 31); } } } for (int i = 0; i < MdctSize; i++) { output[i] = dctTemp[shuffleTable[i]]; } }int decodeSpectrum( channel_t *this, BitReaderCxt *br ){ frame_t *frame = this->frame; for( int i=0; i<frame->quantizationUnitCount; ++i ) { int startSubband = ga_isp_ldac[i]; int endSubband = ga_isp_ldac[i+1]; int nsps = ga_nsps_ldac[i]; int wl = ga_wl_ldac[this->precisions[i]]; if( this->precisions[i] == 1 ) { int idxSpectrum = startSubband; if( nsps == 2 ) { int value = decode2DSpectrum[ReadInt( br, LDAC_2DIMSPECBITS )]; this->quantizedSpectra[idxSpectrum] = ((value>>2)&3) - 1; this->quantizedSpectra[idxSpectrum+1] = (value&3) - 1; } else { for (int j = 0; j < nsps/4; j++, idxSpectrum+=4) { int value = decode4DSpectrum[ReadInt( br, LDAC_4DIMSPECBITS )]; this->quantizedSpectra[idxSpectrum] = ((value>>6)&3) - 1; this->quantizedSpectra[idxSpectrum+1] = ((value>>4)&3) - 1; this->quantizedSpectra[idxSpectrum+2] = ((value>>2)&3) - 1; this->quantizedSpectra[idxSpectrum+3] = (value&3) - 1; } } } else { for( int j = startSubband; j<endSubband; ++j ) this->quantizedSpectra[j] = ReadSignedInt( br, wl ); } } LOG_ARRAY_LEN( this->quantizedSpectra, "%3d, ", frame->quantizationUnitCount ); return 0;}static void dequantizeQuantUnit( channel_t* this, int band ){ const int subBandIndex = ga_isp_ldac[band]; const int subBandCount = ga_nsps_ldac[band]; const scalar stepSize = QuantizerStepSize[this->precisions[band]]; const scalar stepSizeFine = QuantizerFineStepSize[this->precisions[band]]; for( int sb=0; sb<subBandCount; ++sb ) { //Q31 <- Q00 * Q30 //Q31 <- Q00 * Q31 const scalar coarse = mul_lsftrnd_ldac(this->quantizedSpectra[subBandIndex+sb], stepSize, -1); const scalar fine = mul_lsftrnd_ldac(this->quantizedSpectraFine[subBandIndex+sb], stepSizeFine, 0); this->spectra[subBandIndex + sb] = coarse + fine; }}void dequantizeSpectra( channel_t *this ){ frame_t *frame = this->frame; memset( this->spectra, 0, sizeof(this->spectra) ); for( int i=0; i<frame->quantizationUnitCount; ++i ) { dequantizeQuantUnit( this, i ); } LOG_ARRAY_LEN( this->spectra, "%e, ", ga_isp_ldac[frame->quantizationUnitCount-1] + ga_nsps_ldac[frame->quantizationUnitCount-1] ); }How this should be optimized?
UPD Disassembly of macromul_rsftrnd_ldac (inside simple function):
00000000 <$_multiply_shift31>: 0: 2f 00 13 cc rMAC = r1 * r0 (SS); 4: ff 00 11 91 rMAC = rMAC ASHIFT -1 (56bit); 8: 00 00 31 8d r1 = rMAC LSHIFT 0; c: 20 00 21 8d r0 = rMAC LSHIFT 32; 10: 00 40 00 fd r0 = r0 + 1073741824; 14: 00 00 22 01 18: 00 00 30 07 r1 = r1 + Null + Carry; 1c: 01 00 43 8d r2 = r1 LSHIFT 1; 20: e1 00 22 8d r0 = r0 LSHIFT -31; 24: e1 00 33 91 r1 = r1 ASHIFT -31; 28: 00 00 24 87 r0 = r0 OR r2;0000002c <Lc_multiply_shift31_2>: 2c: 0f 00 0d dc rts;- \$\begingroup\$
mul_rsftrnd_ldacaccording to my information is a macro that expands to a plain old 64-bit multiplication followed by rounding and a right shift, which may or may not map well onto the instructions offered by your DSP. Can you find out what that compiles to? Does the compiler that you use offer any intrinsic functions that map well onto the special DSP-oriented instructions supported by the DSP?\$\endgroup\$user555045– user5550452022-01-26 00:31:05 +00:00CommentedJan 26, 2022 at 0:31 - \$\begingroup\$@harold It compiles into 11 instructions, updated the question accordingly. Compiler provides support for some of the "TR-18037 Embedded C extensions" (
fract- Q1.31 andaccum- Q9.63 types).\$\endgroup\$Anonymix321– Anonymix3212022-01-26 09:49:02 +00:00CommentedJan 26, 2022 at 9:49
You mustlog in to answer this question.
Explore related questions
See similar questions with these tags.