- Notifications
You must be signed in to change notification settings - Fork14.5k
[LLVM][NVPTX] Upstream tanh intrinsic for libdevice#149596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.
Already on GitHub?Sign in to your account
base:main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-nvptx Author: Meredith Julian (mjulian31) ChangesCurrently __nv_fast_tanhf() in libdevice maps to an nvvm intrinsic that has not been upstreamed, which is causing issues when using the NVPTX backend from upstream. Instead of upstreaming the intrinsic, we can instead use the existing Intrinsic::tanh with the afn flag. This change adds NVPTX backend support for ISD::TANH, adds auto-upgrade for the old tanh_approx intrinsic to @llvm.tanh.f32 with afn flag so that libdevice works properly upstream, and adds a basic codegen test and a case to the auto-upgrade test. Full diff:https://github.com/llvm/llvm-project/pull/149596.diff 5 Files Affected:
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cppindex 86285a03c66bb..7b2640ede9c49 100644--- a/llvm/lib/IR/AutoUpgrade.cpp+++ b/llvm/lib/IR/AutoUpgrade.cpp@@ -1438,6 +1438,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Case("popc.ll", true) .Case("h2f", true) .Case("swap.lo.hi.b64", true)+ .Case("tanh.approx.f32", true) .Default(false); if (Expand) {@@ -2532,6 +2533,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, MDNode *MD = MDNode::get(Builder.getContext(), {}); LD->setMetadata(LLVMContext::MD_invariant_load, MD); return LD;+ } else if (Name == "tanh.approx.f32") {+ // nvvm.tanh.approx.f32 -> afn llvm.tanh.f32+ FastMathFlags FMF;+ FMF.setApproxFunc();+ Rep = Builder.CreateUnaryIntrinsic(Intrinsic::tanh, CI->getArgOperand(0),+ FMF); } else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") { Value *Arg = Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cppindex 7aa06f9079b09..74511929bb1c6 100644--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp@@ -950,10 +950,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // promoted to f32. v2f16 is expanded to f16, which is then promoted // to f32. for (const auto &Op :- {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {+ {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) { setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::f32, Legal);- setOperationAction(Op, MVT::f64, Legal);+ // fsin, fcos, and ftanh are not supported on f64+ if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {+ setOperationAction(Op, MVT::f64, Legal);+ } setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand); setOperationAction(Op, MVT::bf16, Promote); AddPromotedToType(Op, MVT::bf16, MVT::f32);diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.tdindex a5bb83dfadb84..959e5297971f9 100644--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td@@ -1233,7 +1233,7 @@ defm FMA_F32 : FMA<F32RT, allow_ftz = true>; defm FMA_F32x2 : FMA<F32X2RT, allow_ftz = true, preds = [hasF32x2Instructions]>; defm FMA_F64 : FMA<F64RT, allow_ftz = false>;-// sin/cos+// sin/cos/tanh class UnaryOpAllowsApproxFn<SDPatternOperator operator> : PatFrag<(ops node:$A),@@ -1249,6 +1249,9 @@ def COS_APPROX_f32 : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz), "cos.approx$ftz.f32", [(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>;+def TANH_APPROX_f32 :+ BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32",+ [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>; //----------------------------------- // Bitwise operationsdiff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.llindex a17f11a680aa2..362586af4f9b7 100644--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll@@ -17,6 +17,8 @@ declare float @llvm.nvvm.fabs.f(float) declare float @llvm.nvvm.fabs.ftz.f(float) declare double @llvm.nvvm.fabs.d(double)+declare float @llvm.nvvm.tanh.approx.f32(float)+ declare i16 @llvm.nvvm.max.s(i16, i16) declare i32 @llvm.nvvm.max.i(i32, i32) declare i64 @llvm.nvvm.max.ll(i64, i64)@@ -138,6 +140,13 @@ define void @fabs(float %a, double %b) { ret void }+; CHECK-LABEL: @tanh+define void @tanh(float %a) {+; CHECK: call afn float @llvm.tanh.f32(float %a)+ %r1 = call float @llvm.nvvm.tanh.approx.f32(float %a)+ ret void+}+ ; CHECK-LABEL: @min_max define void @min_max(i16 %a1, i16 %a2, i32 %b1, i32 %b2, i64 %c1, i64 %c2) { ; CHECK: [[maxs:%[a-zA-Z0-9.]+]] = icmp sge i16 %a1, %a2diff --git a/llvm/test/CodeGen/NVPTX/tanhf.ll b/llvm/test/CodeGen/NVPTX/tanhf.llnew file mode 100644index 0000000000000..73f3ca68768e9--- /dev/null+++ b/llvm/test/CodeGen/NVPTX/tanhf.ll@@ -0,0 +1,70 @@+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5+; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}+++define float @test1(float %in) local_unnamed_addr {+; CHECK-LABEL: test1(+; CHECK: {+; CHECK-NEXT: .reg .b32 %r<3>;+; CHECK-EMPTY:+; CHECK-NEXT: // %bb.0:+; CHECK-NEXT: ld.param.b32 %r1, [test1_param_0];+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;+; CHECK-NEXT: ret;+ %call = call afn float @llvm.tanh.f32(float %in)+ ret float %call+}++define float @test2(float %in) local_unnamed_addr {+; CHECK-LABEL: test2(+; CHECK: {+; CHECK-NEXT: .reg .b32 %r<3>;+; CHECK-EMPTY:+; CHECK-NEXT: // %bb.0:+; CHECK-NEXT: ld.param.b32 %r1, [test2_param_0];+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;+; CHECK-NEXT: ret;+ %call = tail call afn float @llvm.tanh.f32(float %in)+ ret float %call+}++define half @test3(half %in) local_unnamed_addr {+; CHECK-LABEL: test3(+; CHECK: {+; CHECK-NEXT: .reg .b16 %rs<3>;+; CHECK-NEXT: .reg .b32 %r<3>;+; CHECK-EMPTY:+; CHECK-NEXT: // %bb.0:+; CHECK-NEXT: ld.param.b16 %rs1, [test3_param_0];+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;+; CHECK-NEXT: ret;+ %call = call afn half @llvm.tanh.f16(half %in)+ ret half %call+}++define half @test4(half %in) local_unnamed_addr {+; CHECK-LABEL: test4(+; CHECK: {+; CHECK-NEXT: .reg .b16 %rs<3>;+; CHECK-NEXT: .reg .b32 %r<3>;+; CHECK-EMPTY:+; CHECK-NEXT: // %bb.0:+; CHECK-NEXT: ld.param.b16 %rs1, [test4_param_0];+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;+; CHECK-NEXT: ret;+ %call = tail call afn half @llvm.tanh.f16(half %in)+ ret half %call+}++declare float @llvm.tanh.f32(float)+declare half @llvm.tanh.f16(half)+ |
@llvm/pr-subscribers-llvm-ir Author: Meredith Julian (mjulian31) ChangesCurrently __nv_fast_tanhf() in libdevice maps to an nvvm intrinsic that has not been upstreamed, which is causing issues when using the NVPTX backend from upstream. Instead of upstreaming the intrinsic, we can instead use the existing Intrinsic::tanh with the afn flag. This change adds NVPTX backend support for ISD::TANH, adds auto-upgrade for the old tanh_approx intrinsic to @llvm.tanh.f32 with afn flag so that libdevice works properly upstream, and adds a basic codegen test and a case to the auto-upgrade test. Full diff:https://github.com/llvm/llvm-project/pull/149596.diff 5 Files Affected:
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cppindex 86285a03c66bb..7b2640ede9c49 100644--- a/llvm/lib/IR/AutoUpgrade.cpp+++ b/llvm/lib/IR/AutoUpgrade.cpp@@ -1438,6 +1438,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Case("popc.ll", true) .Case("h2f", true) .Case("swap.lo.hi.b64", true)+ .Case("tanh.approx.f32", true) .Default(false); if (Expand) {@@ -2532,6 +2533,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, MDNode *MD = MDNode::get(Builder.getContext(), {}); LD->setMetadata(LLVMContext::MD_invariant_load, MD); return LD;+ } else if (Name == "tanh.approx.f32") {+ // nvvm.tanh.approx.f32 -> afn llvm.tanh.f32+ FastMathFlags FMF;+ FMF.setApproxFunc();+ Rep = Builder.CreateUnaryIntrinsic(Intrinsic::tanh, CI->getArgOperand(0),+ FMF); } else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") { Value *Arg = Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cppindex 7aa06f9079b09..74511929bb1c6 100644--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp@@ -950,10 +950,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // promoted to f32. v2f16 is expanded to f16, which is then promoted // to f32. for (const auto &Op :- {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {+ {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) { setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::f32, Legal);- setOperationAction(Op, MVT::f64, Legal);+ // fsin, fcos, and ftanh are not supported on f64+ if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {+ setOperationAction(Op, MVT::f64, Legal);+ } setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand); setOperationAction(Op, MVT::bf16, Promote); AddPromotedToType(Op, MVT::bf16, MVT::f32);diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.tdindex a5bb83dfadb84..959e5297971f9 100644--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td@@ -1233,7 +1233,7 @@ defm FMA_F32 : FMA<F32RT, allow_ftz = true>; defm FMA_F32x2 : FMA<F32X2RT, allow_ftz = true, preds = [hasF32x2Instructions]>; defm FMA_F64 : FMA<F64RT, allow_ftz = false>;-// sin/cos+// sin/cos/tanh class UnaryOpAllowsApproxFn<SDPatternOperator operator> : PatFrag<(ops node:$A),@@ -1249,6 +1249,9 @@ def COS_APPROX_f32 : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz), "cos.approx$ftz.f32", [(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>;+def TANH_APPROX_f32 :+ BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32",+ [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>; //----------------------------------- // Bitwise operationsdiff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.llindex a17f11a680aa2..362586af4f9b7 100644--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll@@ -17,6 +17,8 @@ declare float @llvm.nvvm.fabs.f(float) declare float @llvm.nvvm.fabs.ftz.f(float) declare double @llvm.nvvm.fabs.d(double)+declare float @llvm.nvvm.tanh.approx.f32(float)+ declare i16 @llvm.nvvm.max.s(i16, i16) declare i32 @llvm.nvvm.max.i(i32, i32) declare i64 @llvm.nvvm.max.ll(i64, i64)@@ -138,6 +140,13 @@ define void @fabs(float %a, double %b) { ret void }+; CHECK-LABEL: @tanh+define void @tanh(float %a) {+; CHECK: call afn float @llvm.tanh.f32(float %a)+ %r1 = call float @llvm.nvvm.tanh.approx.f32(float %a)+ ret void+}+ ; CHECK-LABEL: @min_max define void @min_max(i16 %a1, i16 %a2, i32 %b1, i32 %b2, i64 %c1, i64 %c2) { ; CHECK: [[maxs:%[a-zA-Z0-9.]+]] = icmp sge i16 %a1, %a2diff --git a/llvm/test/CodeGen/NVPTX/tanhf.ll b/llvm/test/CodeGen/NVPTX/tanhf.llnew file mode 100644index 0000000000000..73f3ca68768e9--- /dev/null+++ b/llvm/test/CodeGen/NVPTX/tanhf.ll@@ -0,0 +1,70 @@+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5+; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}+++define float @test1(float %in) local_unnamed_addr {+; CHECK-LABEL: test1(+; CHECK: {+; CHECK-NEXT: .reg .b32 %r<3>;+; CHECK-EMPTY:+; CHECK-NEXT: // %bb.0:+; CHECK-NEXT: ld.param.b32 %r1, [test1_param_0];+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;+; CHECK-NEXT: ret;+ %call = call afn float @llvm.tanh.f32(float %in)+ ret float %call+}++define float @test2(float %in) local_unnamed_addr {+; CHECK-LABEL: test2(+; CHECK: {+; CHECK-NEXT: .reg .b32 %r<3>;+; CHECK-EMPTY:+; CHECK-NEXT: // %bb.0:+; CHECK-NEXT: ld.param.b32 %r1, [test2_param_0];+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;+; CHECK-NEXT: ret;+ %call = tail call afn float @llvm.tanh.f32(float %in)+ ret float %call+}++define half @test3(half %in) local_unnamed_addr {+; CHECK-LABEL: test3(+; CHECK: {+; CHECK-NEXT: .reg .b16 %rs<3>;+; CHECK-NEXT: .reg .b32 %r<3>;+; CHECK-EMPTY:+; CHECK-NEXT: // %bb.0:+; CHECK-NEXT: ld.param.b16 %rs1, [test3_param_0];+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;+; CHECK-NEXT: ret;+ %call = call afn half @llvm.tanh.f16(half %in)+ ret half %call+}++define half @test4(half %in) local_unnamed_addr {+; CHECK-LABEL: test4(+; CHECK: {+; CHECK-NEXT: .reg .b16 %rs<3>;+; CHECK-NEXT: .reg .b32 %r<3>;+; CHECK-EMPTY:+; CHECK-NEXT: // %bb.0:+; CHECK-NEXT: ld.param.b16 %rs1, [test4_param_0];+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;+; CHECK-NEXT: ret;+ %call = tail call afn half @llvm.tanh.f16(half %in)+ ret half %call+}++declare float @llvm.tanh.f32(float)+declare half @llvm.tanh.f16(half)+ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others.Learn more.
LGTM with a nit.
Uh oh!
There was an error while loading.Please reload this page.
For the context: it appears that the dependency on the new intrinsic was introduced in libdevice in CUDA-12.8+ and we're not touching it from the clang side yet, AFAICT. Please make sure that all tests pass -- github reports that some CI checks have failed. |
It looks like these tests are failing from this recent commit, and are not related to this change:#149571 |
And yes, this was added to libdevice in CUDA 12.8- hence adding the auto-upgrade for the previously-used intrinsic |
Currently __nv_fast_tanhf() in libdevice maps to an nvvm intrinsic that has not been upstreamed, which is causing issues when using the NVPTX backend from upstream. Instead of upstreaming the intrinsic, we can instead use the existing Intrinsic::tanh with the afn flag. This change adds NVPTX backend support for ISD::TANH, adds auto-upgrade for the old tanh_approx intrinsic to @llvm.tanh.f32 with afn flag so that libdevice works properly upstream, and adds a basic codegen test and a case to the auto-upgrade test.