Jun 28, 2024 · Jun 17, 2024 · Jun 17, 2024 · Jun 17, 2024 · Jun 17, 2024 · Jun 17, 2024
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
                    {
                        intrinsic = NI_AVX10v1_MultiplyLow;
                    }
                    else
                    else if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ_VL))
                    {
                        assert(compIsaSupportedDebugOnly(InstructionSet_AVX512DQ_VL));
                        intrinsic = NI_AVX512DQ_VL_MultiplyLow;
                    }
                    else
                    {
                        assert(((simdSize == 16) && compOpportunisticallyDependsOn(InstructionSet_SSE41)) ||
                               ((simdSize == 32) && compOpportunisticallyDependsOn(InstructionSet_AVX2)));

                        // Make op1 and op2 multi-use:
                        GenTree* op1Dup = fgMakeMultiUse(&op1);
                        GenTree* op2Dup = fgMakeMultiUse(&op2);

                        const bool is256 = simdSize == 32;

                        // Vector256<ulong> tmp0 = Avx2.Multiply(left, right);
                        GenTreeHWIntrinsic* tmp0 =
                            gtNewSimdHWIntrinsicNode(type, op1, op2, is256 ? NI_AVX2_Multiply : NI_SSE2_Multiply,
                                                     CORINFO_TYPE_ULONG, simdSize);

                        // Vector256<uint> tmp1 = Avx2.Shuffle(right.AsUInt32(), ZWXY);
                        GenTree*            shuffleMask = gtNewIconNode(SHUFFLE_ZWXY, TYP_INT);
                        GenTreeHWIntrinsic* tmp1        = gtNewSimdHWIntrinsicNode(type, op2Dup, shuffleMask,
                                                                            is256 ? NI_AVX2_Shuffle : NI_SSE2_Shuffle,
                                                                                   CORINFO_TYPE_UINT, simdSize);

                        // Vector256<uint> tmp2 = Avx2.MultiplyLow(left.AsUInt32(), tmp1);
                        GenTreeHWIntrinsic* tmp2 =
                            gtNewSimdHWIntrinsicNode(type, op1Dup, tmp1,
                                                     is256 ? NI_AVX2_MultiplyLow : NI_SSE41_MultiplyLow,
                                                     CORINFO_TYPE_UINT, simdSize);

                        // Vector256<int> tmp3 = Avx2.HorizontalAdd(tmp2.AsInt32(), Vector256<int>.Zero);
                        GenTreeHWIntrinsic* tmp3 =
                            gtNewSimdHWIntrinsicNode(type, tmp2, gtNewZeroConNode(type),
                                                     is256 ? NI_AVX2_HorizontalAdd : NI_SSSE3_HorizontalAdd,
                                                     CORINFO_TYPE_UINT, simdSize);

                        // Vector256<int> tmp4 = Avx2.Shuffle(tmp3, YWXW);
                        shuffleMask = gtNewIconNode(SHUFFLE_YWXW, TYP_INT);
                        GenTreeHWIntrinsic* tmp4 =
                            gtNewSimdHWIntrinsicNode(type, tmp3, shuffleMask, is256 ? NI_AVX2_Shuffle : NI_SSE2_Shuffle,
                                                     CORINFO_TYPE_UINT, simdSize);

                        // result = tmp0 + tmp4;
                        op1       = tmp0;
                        op2       = tmp4;
                        intrinsic = simdSize == 32 ? NI_AVX2_Add : NI_SSE2_Add;
                    }

                    break;
                }
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp

            if (varTypeIsLong(simdBaseType))
            {
                if (simdSize != 64 && !canUseEvexEncoding())
                if (TARGET_POINTER_SIZE == 4)
                {
                    // TODO-XARCH-CQ: We should support long/ulong multiplication
                    // TODO-XARCH-CQ: 32bit support
                    break;
                }
                // else if simdSize == 64 then above assert would check if baseline isa supported

 #if defined(TARGET_X86)
                // TODO-XARCH-CQ: We need to support 64-bit CreateBroadcast
                break;
 #endif // TARGET_X86
                if ((simdSize == 32) && compOpportunisticallyDependsOn(InstructionSet_AVX2))
                {
                    // Emulate NI_AVX512DQ_VL_MultiplyLow with AVX2 for SIMD32
                }
                else if ((simdSize == 16) && compOpportunisticallyDependsOn(InstructionSet_SSE41))
                {
                    // Emulate NI_AVX512DQ_VL_MultiplyLow with SSE41 for SIMD16
                }
                else
                {
                    // Software fallback
                    break;
                }
            }

            CORINFO_ARG_LIST_HANDLE arg1     = sig->args;
diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h
 #define SHUFFLE_XYZW 0x1B // 00 01 10 11
 #define SHUFFLE_YXYX 0x44 // 01 00 01 00
 #define SHUFFLE_YWXZ 0x72 // 01 11 00 10
 #define SHUFFLE_YWXW 0x73 // 01 11 00 11
 #define SHUFFLE_YYZZ 0x5A // 01 01 10 10
 #define SHUFFLE_ZXXX 0x80 // 10 00 00 00
 #define SHUFFLE_ZXXY 0x81 // 10 00 00 01
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs
 // Licensed to the .NET Foundation under one or more agreements.
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.

 using System.Diagnostics;
Original file line number	Diff line number	Diff line change
Expand Up		@@ -21592,11 +21592,55 @@ GenTree* Compiler::gtNewSimdBinOpNode(
		{
		intrinsic = NI_AVX10v1_MultiplyLow;
		}
		else
		else if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ_VL))
tannergooding marked this conversation as resolved. Outdated Show resolved Hide resolved
		{
		assert(compIsaSupportedDebugOnly(InstructionSet_AVX512DQ_VL));
		intrinsic = NI_AVX512DQ_VL_MultiplyLow;
		}
		else
		{
		assert(((simdSize == 16) && compOpportunisticallyDependsOn(InstructionSet_SSE41)) \|\|
		((simdSize == 32) && compOpportunisticallyDependsOn(InstructionSet_AVX2)));

		// Make op1 and op2 multi-use:
		GenTree* op1Dup = fgMakeMultiUse(&op1);
		GenTree* op2Dup = fgMakeMultiUse(&op2);

		const bool is256 = simdSize == 32;

		// Vector256<ulong> tmp0 = Avx2.Multiply(left, right);
		GenTreeHWIntrinsic* tmp0 =
		gtNewSimdHWIntrinsicNode(type, op1, op2, is256 ? NI_AVX2_Multiply : NI_SSE2_Multiply,
		CORINFO_TYPE_ULONG, simdSize);

		// Vector256<uint> tmp1 = Avx2.Shuffle(right.AsUInt32(), ZWXY);
		GenTree* shuffleMask = gtNewIconNode(SHUFFLE_ZWXY, TYP_INT);
		GenTreeHWIntrinsic* tmp1 = gtNewSimdHWIntrinsicNode(type, op2Dup, shuffleMask,
		is256 ? NI_AVX2_Shuffle : NI_SSE2_Shuffle,
		CORINFO_TYPE_UINT, simdSize);

		// Vector256<uint> tmp2 = Avx2.MultiplyLow(left.AsUInt32(), tmp1);
		GenTreeHWIntrinsic* tmp2 =
		gtNewSimdHWIntrinsicNode(type, op1Dup, tmp1,
		is256 ? NI_AVX2_MultiplyLow : NI_SSE41_MultiplyLow,
		CORINFO_TYPE_UINT, simdSize);

		// Vector256<int> tmp3 = Avx2.HorizontalAdd(tmp2.AsInt32(), Vector256<int>.Zero);
		GenTreeHWIntrinsic* tmp3 =
		gtNewSimdHWIntrinsicNode(type, tmp2, gtNewZeroConNode(type),
		is256 ? NI_AVX2_HorizontalAdd : NI_SSSE3_HorizontalAdd,
		CORINFO_TYPE_UINT, simdSize);
Comment on lines +21536 to +21540 Copy link Member tannergooding Jun 27, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. I know in other places we've started avoiding `hadd` in favor of `shuffle+add`; it might be worth seeing if that's appropriate here too (low priority, non-blocking). Copy link Member Author EgorBo Jun 27, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. I tried to benchmark different implementations for it and they were all equally fast, e.g. #99871 (comment) tannergooding reacted with thumbs up emoji

		// Vector256<int> tmp4 = Avx2.Shuffle(tmp3, YWXW);
		shuffleMask = gtNewIconNode(SHUFFLE_YWXW, TYP_INT);
		GenTreeHWIntrinsic* tmp4 =
		gtNewSimdHWIntrinsicNode(type, tmp3, shuffleMask, is256 ? NI_AVX2_Shuffle : NI_SSE2_Shuffle,
		CORINFO_TYPE_UINT, simdSize);

		// result = tmp0 + tmp4;
		op1 = tmp0;
		op2 = tmp4;
		intrinsic = simdSize == 32 ? NI_AVX2_Add : NI_SSE2_Add;
		}

		break;
		}
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2706,17 +2706,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

		if (varTypeIsLong(simdBaseType))
		{
		if (simdSize != 64 && !canUseEvexEncoding())
		if (TARGET_POINTER_SIZE == 4)
		{
		// TODO-XARCH-CQ: We should support long/ulong multiplication
		// TODO-XARCH-CQ: 32bit support
Copy link Member tannergooding Jun 27, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. What's blocking 32-bit support? It doesn't look like we're using any `_X64` intrinsics in the fallback logic. Copy link Member Author EgorBo Jun 27, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Not sure, to be honest; that check was pre-existing, and I only changed the comment.
		break;
		}
		// else if simdSize == 64 then above assert would check if baseline isa supported

		#if defined(TARGET_X86)
		// TODO-XARCH-CQ: We need to support 64-bit CreateBroadcast
		break;
		#endif // TARGET_X86
		if ((simdSize == 32) && compOpportunisticallyDependsOn(InstructionSet_AVX2))
		{
		// Emulate NI_AVX512DQ_VL_MultiplyLow with AVX2 for SIMD32
		}
		else if ((simdSize == 16) && compOpportunisticallyDependsOn(InstructionSet_SSE41))
		{
		// Emulate NI_AVX512DQ_VL_MultiplyLow with SSE41 for SIMD16
		}
		else
		{
		// Software fallback
		break;
		}
		}

		CORINFO_ARG_LIST_HANDLE arg1 = sig->args;
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -1015,6 +1015,7 @@ void BroadcastConstantToSimd(TSimd* result, TBase arg0)
		#define SHUFFLE_XYZW 0x1B // 00 01 10 11
		#define SHUFFLE_YXYX 0x44 // 01 00 01 00
		#define SHUFFLE_YWXZ 0x72 // 01 11 00 10
		#define SHUFFLE_YWXW 0x73 // 01 11 00 11
		#define SHUFFLE_YYZZ 0x5A // 01 01 10 10
		#define SHUFFLE_ZXXX 0x80 // 10 00 00 00
		#define SHUFFLE_ZXXY 0x81 // 10 00 00 01
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,4 @@
		// Licensed to the .NET Foundation under one or more agreements.
		// Licensed to the .NET Foundation under one or more agreements.
		// The .NET Foundation licenses this file to you under the MIT license.

		using System.Diagnostics;
Expand Down