- Notifications
You must be signed in to change notification settings - Fork5.2k
Use multi-reg load/store for DecodeFromUTF8#100589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.
Already on GitHub?Sign in to your account
Uh oh!
There was an error while loading.Please reload this page.
Changes fromall commits
File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading.Please reload this page.
Jump to
Uh oh!
There was an error while loading.Please reload this page.
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -90,6 +90,17 @@ private static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Sp | ||
| } | ||
| } | ||
| end = srcMax - 66; | ||
| if (AdvSimd.Arm64.IsSupported && (end >= src)) | ||
| { | ||
| AdvSimdDecode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes); | ||
| if (src == srcEnd) | ||
| { | ||
| goto DoneExit; | ||
| } | ||
| } | ||
| end = srcMax - 24; | ||
| if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src)) | ||
| { | ||
| @@ -844,6 +855,107 @@ private static Vector128<byte> SimdShuffle(Vector128<byte> left, Vector128<byte> | ||
| return Vector128.ShuffleUnsafe(left, right); | ||
| } | ||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] | ||
| private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) | ||
| { | ||
| // C# implementation of https://github.com/aklomp/base64/blob/3a5add8652076612a8407627a42c768736a4263f/lib/arch/neon64/dec_loop.c | ||
| // If we have AdvSimd support, pick off 64 bytes at a time for as long as we can, | ||
| // but make sure that we quit before seeing any == markers at the end of the | ||
| // string. 64 + 2 = 66 bytes. | ||
| Vector128<byte> decLutOne1 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte(); | ||
| Vector128<byte> decLutOne2 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte(); | ||
Comment on lines +867 to +868 Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. These could have been | ||
| Vector128<byte> decLutOne3 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(); | ||
| Vector128<byte> decLutOne4 = Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte(); | ||
| Vector128<byte> decLutTwo1 = Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte(); | ||
| Vector128<byte> decLutTwo2 = Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte(); | ||
| Vector128<byte> decLutTwo3 = Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte(); | ||
| Vector128<byte> decLutTwo4 = Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte(); | ||
Comment on lines +867 to +874 Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. It's not clear why these all need to be declared up front. It's effectively defining unnecessary locals and work for the JIT to do. This could have been | ||
| Vector128<byte> decOne1; | ||
| Vector128<byte> decOne2; | ||
| Vector128<byte> decOne3; | ||
| Vector128<byte> decOne4; | ||
| Vector128<byte> decTwo1; | ||
| Vector128<byte> decTwo2; | ||
| Vector128<byte> decTwo3; | ||
| Vector128<byte> decTwo4; | ||
| Vector128<byte> str1; | ||
| Vector128<byte> str2; | ||
| Vector128<byte> str3; | ||
| Vector128<byte> str4; | ||
| Vector128<byte> res1; | ||
| Vector128<byte> res2; | ||
| Vector128<byte> res3; | ||
Comment on lines +876 to +890 Member
| ||
| byte* src = srcBytes; | ||
| byte* dest = destBytes; | ||
| Vector128<byte> offset = AdvSimd.DuplicateToVector128((byte)0x3F); | ||
Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more.
The former allows more general optimizations to kick in as it does not prescribe a particular instruction be emitted. | ||
| var decLutOne = (decLutOne1, decLutOne2, decLutOne3, decLutOne4); | ||
| var decLutTwo = (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4); | ||
| do | ||
| { | ||
| // Load 64 bytes and de-interleave. | ||
| AssertRead<Vector128<byte>>(src, srcStart, sourceLength); | ||
Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. I know this is pre-existing to the PR, but this is not a great method name. It should probably be something like | ||
| (str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src); | ||
Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. It would be nice if ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. Did you mean a wrapper method, say Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. No, rather as per my comment above (#100589 (comment)) we prefer to write xplat code so logic can be shared across CPU architectures where possible. To facilitate this, we ideally have the JIT recognize simple patterns and optimize them.
The driving question tends to be, "does this logic actually have to be platform/ISA specific, or is there a simple pattern we could enable the JIT to recognize instead?". In the case simple pattern recognition is possible, then it's generally ideal to enable it so that existing user code lights up and gets faster on Arm64 without the need for users to go and manually do the work. It also lets the same code automatically light-up for things like ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. Right, thanks for this clarification. I'll keep this mind for the common patterns in the future. | ||
| // Get indices for second LUT: | ||
| decTwo1 = AdvSimd.SubtractSaturate(str1, offset); | ||
| decTwo2 = AdvSimd.SubtractSaturate(str2, offset); | ||
| decTwo3 = AdvSimd.SubtractSaturate(str3, offset); | ||
| decTwo4 = AdvSimd.SubtractSaturate(str4, offset); | ||
| // Get values from first LUT. Out-of-range indices are set to 0. | ||
| decOne1 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str1); | ||
| decOne2 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str2); | ||
| decOne3 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str3); | ||
| decOne4 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str4); | ||
| // Get values from second LUT. Out-of-range indices are unchanged. | ||
| decTwo1 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo1, decLutTwo, decTwo1); | ||
| decTwo2 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo2, decLutTwo, decTwo2); | ||
| decTwo3 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo3, decLutTwo, decTwo3); | ||
| decTwo4 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo4, decLutTwo, decTwo4); | ||
| // Invalid values are set to 255 during above look-ups using 'decLutTwo' and 'decLutTwo'. | ||
| // Thus the intermediate results 'decOne' and 'decTwo' could be OR-ed to get final values. | ||
| str1 = decOne1 | decTwo1; | ||
| str2 = decOne2 | decTwo2; | ||
| str3 = decOne3 | decTwo3; | ||
| str4 = decOne4 | decTwo4; | ||
Comment on lines +904 to +927 Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. This general algorithm probably could do with a comment walking through what's happening so the "magic" is better understood, much as exists for ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. Agreed, it's difficult to follow. Although, it's a bit tricky to describe the entire workings with lookup tables and interleaving. I'll try to make it less opaque. | ||
| // Check for invalid input, any value larger than 63. | ||
| Vector128<byte> classified = AdvSimd.CompareGreaterThan(str1, offset) | ||
| | AdvSimd.CompareGreaterThan(str2, offset) | ||
| | AdvSimd.CompareGreaterThan(str3, offset) | ||
| | AdvSimd.CompareGreaterThan(str4, offset); | ||
Comment on lines +930 to +933 Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. These could have been the xplat ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. I didn't understand the xplat bit. Could you elaborate on this, please? Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. We have APIs defined on Using these APIs tends to be preferred over using the platform specific APIs, particularly when there is a clear 1-to-1 mapping. Thus Using the xplat (cross platform) APIs gives the JIT more flexibility in the code it generates and allows more portability, while also often improving readability. ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. Makes sense, I'll update these calls 👍 | ||
| // Check that all bits are zero. | ||
| if (classified != Vector128<byte>.Zero) | ||
| { | ||
| break; | ||
| } | ||
| // Compress four bytes into three. | ||
| res1 = AdvSimd.ShiftLeftLogical(str1, 2) | AdvSimd.ShiftRightLogical(str2, 4); | ||
| res2 = AdvSimd.ShiftLeftLogical(str2, 4) | AdvSimd.ShiftRightLogical(str3, 2); | ||
| res3 = AdvSimd.ShiftLeftLogical(str3, 6) | str4; | ||
Comment on lines +942 to +944 Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. These could have been of the form | ||
| // Interleave and store decoded result. | ||
| AssertWrite<Vector128<byte>>(dest, destStart, destLength); | ||
| AdvSimd.Arm64.StoreVector128x3AndZip(dest, (res1, res2, res3)); | ||
| src += 64; | ||
| dest += 48; | ||
| } | ||
| while (src <= srcEnd); | ||
| srcBytes = src; | ||
| destBytes = dest; | ||
| } | ||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] | ||
| [CompExactlyDependsOn(typeof(Ssse3))] | ||