Apr 14, 2024 · Mar 25, 2024 · Apr 9, 2024 · Apr 9, 2024 · Apr 14, 2024 · Apr 14, 2024
diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs
                        }
                    }

                    end = srcMax - 66;
                    if (AdvSimd.Arm64.IsSupported && (end >= src))
                    {
                        AdvSimdDecode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);

                        if (src == srcEnd)
                        {
                            goto DoneExit;
                        }
                    }

                    end = srcMax - 24;
                    if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src))
                    {
            return Vector128.ShuffleUnsafe(left, right);
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
        private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
        {
            // C# implementation of https://github.com/aklomp/base64/blob/3a5add8652076612a8407627a42c768736a4263f/lib/arch/neon64/dec_loop.c
            // If we have AdvSimd support, pick off 64 bytes at a time for as long as we can,
            // but make sure that we quit before seeing any == markers at the end of the
            // string. 64 + 2 = 66 bytes.

            Vector128<byte> decLutOne1 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte();
            Vector128<byte> decLutOne2 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte();
            Vector128<byte> decLutOne3 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte();
            Vector128<byte> decLutOne4 = Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte();
            Vector128<byte> decLutTwo1 = Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte();
            Vector128<byte> decLutTwo2 = Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte();
            Vector128<byte> decLutTwo3 = Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte();
            Vector128<byte> decLutTwo4 = Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte();

            Vector128<byte> decOne1;
            Vector128<byte> decOne2;
            Vector128<byte> decOne3;
            Vector128<byte> decOne4;
            Vector128<byte> decTwo1;
            Vector128<byte> decTwo2;
            Vector128<byte> decTwo3;
            Vector128<byte> decTwo4;
            Vector128<byte> str1;
            Vector128<byte> str2;
            Vector128<byte> str3;
            Vector128<byte> str4;
            Vector128<byte> res1;
            Vector128<byte> res2;
            Vector128<byte> res3;

            byte* src = srcBytes;
            byte* dest = destBytes;
            Vector128<byte> offset = AdvSimd.DuplicateToVector128((byte)0x3F);
            var decLutOne = (decLutOne1, decLutOne2, decLutOne3, decLutOne4);
            var decLutTwo = (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4);

            do
            {
                // Load 64 bytes and de-interleave.
                AssertRead<Vector128<byte>>(src, srcStart, sourceLength);
                (str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src);

                // Get indices for second LUT:
                decTwo1 = AdvSimd.SubtractSaturate(str1, offset);
                decTwo2 = AdvSimd.SubtractSaturate(str2, offset);
                decTwo3 = AdvSimd.SubtractSaturate(str3, offset);
                decTwo4 = AdvSimd.SubtractSaturate(str4, offset);

                // Get values from first LUT. Out-of-range indices are set to 0.
                decOne1 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str1);
                decOne2 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str2);
                decOne3 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str3);
                decOne4 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str4);

                // Get values from second LUT. Out-of-range indices are unchanged.
                decTwo1 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo1, decLutTwo, decTwo1);
                decTwo2 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo2, decLutTwo, decTwo2);
                decTwo3 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo3, decLutTwo, decTwo3);
                decTwo4 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo4, decLutTwo, decTwo4);

                // Invalid values are set to 255 during above look-ups using 'decLutTwo' and 'decLutTwo'.
                // Thus the intermediate results 'decOne' and 'decTwo' could be OR-ed to get final values.
                str1 = decOne1 | decTwo1;
                str2 = decOne2 | decTwo2;
                str3 = decOne3 | decTwo3;
                str4 = decOne4 | decTwo4;

                // Check for invalid input, any value larger than 63.
                Vector128<byte> classified = AdvSimd.CompareGreaterThan(str1, offset)
                                           | AdvSimd.CompareGreaterThan(str2, offset)
                                           | AdvSimd.CompareGreaterThan(str3, offset)
                                           | AdvSimd.CompareGreaterThan(str4, offset);

                // Check that all bits are zero.
                if (classified != Vector128<byte>.Zero)
                {
                    break;
                }

                // Compress four bytes into three.
                res1 = AdvSimd.ShiftLeftLogical(str1, 2) | AdvSimd.ShiftRightLogical(str2, 4);
                res2 = AdvSimd.ShiftLeftLogical(str2, 4) | AdvSimd.ShiftRightLogical(str3, 2);
                res3 = AdvSimd.ShiftLeftLogical(str3, 6) | str4;

                // Interleave and store decoded result.
                AssertWrite<Vector128<byte>>(dest, destStart, destLength);
                AdvSimd.Arm64.StoreVector128x3AndZip(dest, (res1, res2, res3));

                src += 64;
                dest += 48;
            }
            while (src <= srcEnd);

            srcBytes = src;
            destBytes = dest;
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
        [CompExactlyDependsOn(typeof(Ssse3))]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -90,6 +90,17 @@ private static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Sp
		}
		}

		end = srcMax - 66;
		if (AdvSimd.Arm64.IsSupported && (end >= src))
		{
		AdvSimdDecode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);

		if (src == srcEnd)
		{
		goto DoneExit;
		}
		}

		end = srcMax - 24;
		if ((Ssse3.IsSupported \|\| AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src))
		{
Expand DownExpand Up		@@ -844,6 +855,107 @@ private static Vector128<byte> SimdShuffle(Vector128<byte> left, Vector128<byte>
		return Vector128.ShuffleUnsafe(left, right);
		}

		[MethodImpl(MethodImplOptions.AggressiveInlining)]
		[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
		private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
		{
		// C# implementation of https://github.com/aklomp/base64/blob/3a5add8652076612a8407627a42c768736a4263f/lib/arch/neon64/dec_loop.c
		// If we have AdvSimd support, pick off 64 bytes at a time for as long as we can,
		// but make sure that we quit before seeing any == markers at the end of the
		// string. 64 + 2 = 66 bytes.

		Vector128<byte> decLutOne1 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte();
		Vector128<byte> decLutOne2 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte();
Comment on lines +867 to +868 Copy link Member tannergoodingApr 14, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. These could have been`Vector128<byte>.AllBitsSet` SwapnilGaikwad reacted with thumbs up emoji
		Vector128<byte> decLutOne3 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte();
		Vector128<byte> decLutOne4 = Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte();
		Vector128<byte> decLutTwo1 = Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte();
		Vector128<byte> decLutTwo2 = Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte();
		Vector128<byte> decLutTwo3 = Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte();
		Vector128<byte> decLutTwo4 = Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte();
Comment on lines +867 to +874 Copy link Member tannergoodingApr 14, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. It's not clear why these all need to be declared up front. It's effectively defining unnecessary locals and work for the JIT to do. This could have been`var decLutOne = (Vector128<byte>.AllBitsSet, Vector128<byte>.AllBitsSet, Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(), Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte())` instead, as an example. SwapnilGaikwad reacted with thumbs up emoji

		Vector128<byte> decOne1;
		Vector128<byte> decOne2;
		Vector128<byte> decOne3;
		Vector128<byte> decOne4;
		Vector128<byte> decTwo1;
		Vector128<byte> decTwo2;
		Vector128<byte> decTwo3;
		Vector128<byte> decTwo4;
		Vector128<byte> str1;
		Vector128<byte> str2;
		Vector128<byte> str3;
		Vector128<byte> str4;
		Vector128<byte> res1;
		Vector128<byte> res2;
		Vector128<byte> res3;
Comment on lines +876 to +890 Copy link Member tannergoodingApr 14, 2024• edited Loading Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Similar here, it's not clear why these were defined up front like this. They could have been declared in the loop at the declaration site (since they aren't used outside the loop) making the overall code much more readable, without needing 15 up front declarations SwapnilGaikwad reacted with thumbs up emoji

		byte* src = srcBytes;
		byte* dest = destBytes;
		Vector128<byte> offset = AdvSimd.DuplicateToVector128((byte)0x3F);
Copy link Member tannergoodingApr 14, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. `Vector128.Create<byte>(0x3F)` is preferred over`AdvSimd.DuplicateToVector128`. The former allows more general optimizations to kick in as it does not prescribe a particular instruction be emitted. SwapnilGaikwad reacted with thumbs up emoji
		var decLutOne = (decLutOne1, decLutOne2, decLutOne3, decLutOne4);
		var decLutTwo = (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4);

		do
		{
		// Load 64 bytes and de-interleave.
		AssertRead<Vector128<byte>>(src, srcStart, sourceLength);
Copy link Member tannergoodingApr 14, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. I know this is pre-existing to the PR, but this is not a great method name. It should probably be something like`AssertReadIsValid` or similar.
		(str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src);
Copy link Member tannergoodingApr 14, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. It would be nice if`LoadVector128x4AndUnzip` was something we could pattern match and implicitly light up. Copy link ContributorAuthor SwapnilGaikwadApr 15, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Did you mean a wrapper method, say`LoadVectorAndUnzip()`, that would do the right thing based on the return type? Copy link Member tannergoodingApr 15, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. No, rather as per my comment above (#100589 (comment)) we prefer to write xplat code so logic can be shared across CPU architectures where possible. To facilitate this, we ideally have the JIT recognize simple patterns and optimize them.`LoadVector128x4`, for example, could be done automatically if the JIT were to recognize 4x sequential`Vector128.Load`. `LoadVector128x4AndUnzip` could theoretically have the same done if we were to recognize 4x sequence`Vector128.Load` and a general pattern for`unzip`. Today that would be recognizing`Shuffle` in particular, but longer term it might be a good reason for there to be an xplat`Vector128.Unzip` (since that just corresponds to`Unzip` on Arm64 and`Unpack` on`x64`). The driving question tends to be, "does this logic actually have to be platform/ISA specific, or is there a simple pattern we could enable the JIT to recognize instead?". In the case simple pattern recognition is possible, then it's generally ideal to enable it so that existing user code lights up and gets faster on Arm64 without the need for users to go and manually do the work. It also lets the same code automatically light-up for things like`AdvSimd` vs`SVE`, rather than being strictly tied down to`AdvSimd`. Copy link ContributorAuthor SwapnilGaikwadApr 15, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Right, thanks for this clarification. I'll keep this mind for the common patterns in the future.

		// Get indices for second LUT:
		decTwo1 = AdvSimd.SubtractSaturate(str1, offset);
		decTwo2 = AdvSimd.SubtractSaturate(str2, offset);
		decTwo3 = AdvSimd.SubtractSaturate(str3, offset);
		decTwo4 = AdvSimd.SubtractSaturate(str4, offset);

		// Get values from first LUT. Out-of-range indices are set to 0.
		decOne1 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str1);
		decOne2 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str2);
		decOne3 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str3);
		decOne4 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str4);

		// Get values from second LUT. Out-of-range indices are unchanged.
		decTwo1 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo1, decLutTwo, decTwo1);
		decTwo2 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo2, decLutTwo, decTwo2);
		decTwo3 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo3, decLutTwo, decTwo3);
		decTwo4 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo4, decLutTwo, decTwo4);

		// Invalid values are set to 255 during above look-ups using 'decLutTwo' and 'decLutTwo'.
		// Thus the intermediate results 'decOne' and 'decTwo' could be OR-ed to get final values.
		str1 = decOne1 \| decTwo1;
		str2 = decOne2 \| decTwo2;
		str3 = decOne3 \| decTwo3;
		str4 = decOne4 \| decTwo4;
Comment on lines +904 to +927 Copy link Member tannergoodingApr 14, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. This general algorithm probably could do with a comment walking through what's happening so the "magic" is better understood, much as exists for`Vector128Decode` Copy link ContributorAuthor SwapnilGaikwadApr 15, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Agreed, it's difficult to follow. Although, it's a bit tricky to describe the entire workings with lookup tables and interleaving. I'll try to make it less opaque.

		// Check for invalid input, any value larger than 63.
		Vector128<byte> classified = AdvSimd.CompareGreaterThan(str1, offset)
		\| AdvSimd.CompareGreaterThan(str2, offset)
		\| AdvSimd.CompareGreaterThan(str3, offset)
		\| AdvSimd.CompareGreaterThan(str4, offset);
Comment on lines +930 to +933 Copy link Member tannergoodingApr 14, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. These could have been the xplat`Vector128.GreaterThan(str1, offset)` Copy link ContributorAuthor SwapnilGaikwadApr 15, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. I didn't understand the xplat bit. Could you elaborate on this, please? Copy link Member tannergoodingApr 15, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. We have APIs defined on`Vector128` that expose functionality common across multiple ISAs (AdvSimd, SVE, etc) and common across multiple architectures (Arm64, x64, WASM, etc). Using these APIs tends to be preferred over using the platform specific APIs, particularly when there is a clear 1-to-1 mapping. Thus`x + y` is preferred over`AdvSimd.Add` and`Vector128.GreaterThan` is preffered over`AdvSimd.CompareGreaterThan`. Using the xplat (cross platform) APIs gives the JIT more flexibility in the code it generates and allows more portability, while also often improving readability. Copy link ContributorAuthor SwapnilGaikwadApr 15, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Makes sense, I'll update these calls 👍

		// Check that all bits are zero.
		if (classified != Vector128<byte>.Zero)
		{
		break;
		}

		// Compress four bytes into three.
		res1 = AdvSimd.ShiftLeftLogical(str1, 2) \| AdvSimd.ShiftRightLogical(str2, 4);
		res2 = AdvSimd.ShiftLeftLogical(str2, 4) \| AdvSimd.ShiftRightLogical(str3, 2);
		res3 = AdvSimd.ShiftLeftLogical(str3, 6) \| str4;
Comment on lines +942 to +944 Copy link Member tannergoodingApr 14, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. These could have been of the form`res1 = (str1 << 2) \| (str2 << 4);` SwapnilGaikwad reacted with thumbs up emoji

		// Interleave and store decoded result.
		AssertWrite<Vector128<byte>>(dest, destStart, destLength);
		AdvSimd.Arm64.StoreVector128x3AndZip(dest, (res1, res2, res3));

		src += 64;
		dest += 48;
		}
		while (src <= srcEnd);

		srcBytes = src;
		destBytes = dest;
		}

		[MethodImpl(MethodImplOptions.AggressiveInlining)]
		[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
		[CompExactlyDependsOn(typeof(Ssse3))]
Expand Down