diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index f0fbe0135353f..231e8eb9e90d0 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6208,6 +6208,102 @@ static bool hasMoreUses(const MachineInstr &MI0, const MachineInstr &MI1, MRI.use_instr_nodbg_end()); } +/// Check if all uses of a multiply instruction can be contracted into FMAs. +/// This prevents creating FMAs when the multiply has other uses that can't +/// be contracted, which would duplicate the multiply computation. +static bool allMulUsesCanBeContracted(const MachineInstr &MulMI, + const MachineRegisterInfo &MRI, + bool AllowFusionGlobally) { + if (!isContractableFMul(const_cast(MulMI), + AllowFusionGlobally)) + return false; + Register MulReg = MulMI.getOperand(0).getReg(); + + // Check all uses of the multiply result + for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(MulReg)) { + unsigned Opcode = UseMI.getOpcode(); + + // Direct FADD/FSUB uses + if (Opcode == TargetOpcode::G_FADD || Opcode == TargetOpcode::G_FSUB) { + // Check that we're not contracting both operands (which would duplicate) + Register Op1 = UseMI.getOperand(1).getReg(); + Register Op2 = UseMI.getOperand(2).getReg(); + if (Op1 == MulReg && Op2 == MulReg) + return false; + + continue; + } + + // FNEG → FSUB pattern + // Also handles FNEG → FPEXT → FSUB + if (Opcode == TargetOpcode::G_FNEG) { + Register FNegReg = UseMI.getOperand(0).getReg(); + // ALL users of the FNEG must be contractable FSUBs or FPEXTs leading to + // FSUBs + for (const MachineInstr &FNegUseMI : + MRI.use_nodbg_instructions(FNegReg)) { + unsigned FNegUseOpcode = FNegUseMI.getOpcode(); + + if (FNegUseOpcode == TargetOpcode::G_FSUB) { + continue; + } + if (FNegUseOpcode == TargetOpcode::G_FPEXT) { + // FNEG → FPEXT → FSUB + Register FNegFPExtReg = FNegUseMI.getOperand(0).getReg(); + for (const MachineInstr &FNegFPExtUseMI : + 
MRI.use_nodbg_instructions(FNegFPExtReg)) { + if (FNegFPExtUseMI.getOpcode() != TargetOpcode::G_FSUB) + return false; + } + continue; + } + return false; + } + continue; + } + + // FP_EXTEND → {FADD, FSUB, FMA, FMAD} pattern + // Also handles FP_EXTEND → FNEG → FSUB + if (Opcode == TargetOpcode::G_FPEXT) { + Register FPExtReg = UseMI.getOperand(0).getReg(); + + // ALL users of the FP_EXTEND must be contractable operations or FNEGs + for (const MachineInstr &FPExtUseMI : + MRI.use_nodbg_instructions(FPExtReg)) { + unsigned ExtUseOpcode = FPExtUseMI.getOpcode(); + if (ExtUseOpcode == TargetOpcode::G_FADD || + ExtUseOpcode == TargetOpcode::G_FSUB || + ExtUseOpcode == TargetOpcode::G_FMA || + ExtUseOpcode == TargetOpcode::G_FMAD) { + continue; + } + if (ExtUseOpcode == TargetOpcode::G_FNEG) { + // FP_EXTEND → FNEG → FSUB + Register FPExtFNegReg = FPExtUseMI.getOperand(0).getReg(); + for (const MachineInstr &FPExtFNegUseMI : + MRI.use_nodbg_instructions(FPExtFNegReg)) { + if (FPExtFNegUseMI.getOpcode() != TargetOpcode::G_FSUB) + return false; + } + continue; + } + return false; + } + continue; + } + + // FMA/FMAD uses - assume contractable to avoid complex tracking + if (Opcode == TargetOpcode::G_FMA || Opcode == TargetOpcode::G_FMAD) { + continue; + } + + // Any other use type is not contractable + return false; + } + + return true; +} + bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, bool &AllowFusionGlobally, bool &HasFMAD, bool &Aggressive, @@ -6265,7 +6361,9 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA( // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (isContractableFMul(*LHS.MI, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg))) { + (MRI.hasOneNonDBGUse(LHS.Reg) || + (Aggressive && + allMulUsesCanBeContracted(*LHS.MI, MRI, AllowFusionGlobally)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, {LHS.MI->getOperand(1).getReg(), @@ -6276,7 +6374,9 @@ bool 
CombinerHelper::matchCombineFAddFMulToFMadOrFMA( // fold (fadd x, (fmul y, z)) -> (fma y, z, x) if (isContractableFMul(*RHS.MI, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg))) { + (MRI.hasOneNonDBGUse(RHS.Reg) || + (Aggressive && + allMulUsesCanBeContracted(*RHS.MI, MRI, AllowFusionGlobally)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, {RHS.MI->getOperand(1).getReg(), @@ -6319,6 +6419,9 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA( MachineInstr *FpExtSrc; if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) && isContractableFMul(*FpExtSrc, AllowFusionGlobally) && + (MRI.hasOneNonDBGUse(FpExtSrc->getOperand(0).getReg()) || + (Aggressive && + allMulUsesCanBeContracted(*FpExtSrc, MRI, AllowFusionGlobally))) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FpExtSrc->getOperand(1).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { @@ -6334,6 +6437,9 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA( // Note: Commutes FADD operands. 
if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) && isContractableFMul(*FpExtSrc, AllowFusionGlobally) && + (MRI.hasOneNonDBGUse(FpExtSrc->getOperand(0).getReg()) || + (Aggressive && + allMulUsesCanBeContracted(*FpExtSrc, MRI, AllowFusionGlobally))) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FpExtSrc->getOperand(1).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { @@ -6463,6 +6569,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( mi_match(LHS.MI->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && + allMulUsesCanBeContracted(*FMulMI, MRI, AllowFusionGlobally) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FMulMI->getOperand(0).getReg()))) { MatchInfo = [=](MachineIRBuilder &B) { @@ -6483,6 +6590,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( FMAMI->getOpcode() == PreferredFusedOpcode) { MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg()); if (isContractableFMul(*FMulMI, AllowFusionGlobally) && + allMulUsesCanBeContracted(*FMulMI, MRI, AllowFusionGlobally) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FMAMI->getOperand(0).getReg()))) { MatchInfo = [=](MachineIRBuilder &B) { @@ -6504,6 +6612,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( mi_match(RHS.MI->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && + allMulUsesCanBeContracted(*FMulMI, MRI, AllowFusionGlobally) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FMulMI->getOperand(0).getReg()))) { MatchInfo = [=](MachineIRBuilder &B) { @@ -6524,6 +6633,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( FMAMI->getOpcode() == PreferredFusedOpcode) { MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg()); if (isContractableFMul(*FMulMI, AllowFusionGlobally) && 
+ allMulUsesCanBeContracted(*FMulMI, MRI, AllowFusionGlobally) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FMAMI->getOperand(0).getReg()))) { MatchInfo = [=](MachineIRBuilder &B) { @@ -6570,7 +6680,9 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA( // fold (fsub (fmul x, y), z) -> (fma x, y, -z) if (FirstMulHasFewerUses && (isContractableFMul(*LHS.MI, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg)))) { + (MRI.hasOneNonDBGUse(LHS.Reg) || + (Aggressive && + allMulUsesCanBeContracted(*LHS.MI, MRI, AllowFusionGlobally))))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { Register NegZ = B.buildFNeg(DstTy, RHS.Reg).getReg(0); B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, @@ -6580,8 +6692,10 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA( return true; } // fold (fsub x, (fmul y, z)) -> (fma -y, z, x) - else if ((isContractableFMul(*RHS.MI, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg)))) { + if (isContractableFMul(*RHS.MI, AllowFusionGlobally) && + (MRI.hasOneNonDBGUse(RHS.Reg) || + (Aggressive && + allMulUsesCanBeContracted(*RHS.MI, MRI, AllowFusionGlobally)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { Register NegY = B.buildFNeg(DstTy, RHS.MI->getOperand(1).getReg()).getReg(0); @@ -6613,8 +6727,10 @@ bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA( MachineInstr *FMulMI; // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) if (mi_match(LHSReg, MRI, m_GFNeg(m_MInstr(FMulMI))) && - (Aggressive || (MRI.hasOneNonDBGUse(LHSReg) && - MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg()))) && + ((MRI.hasOneNonDBGUse(LHSReg) && + MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) || + (Aggressive && + allMulUsesCanBeContracted(*FMulMI, MRI, AllowFusionGlobally))) && isContractableFMul(*FMulMI, AllowFusionGlobally)) { MatchInfo = [=, &MI](MachineIRBuilder &B) { Register NegX = @@ -6628,8 +6744,10 @@ bool 
CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA( // fold (fsub x, (fneg (fmul, y, z))) -> (fma y, z, x) if (mi_match(RHSReg, MRI, m_GFNeg(m_MInstr(FMulMI))) && - (Aggressive || (MRI.hasOneNonDBGUse(RHSReg) && - MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg()))) && + ((MRI.hasOneNonDBGUse(RHSReg) && + MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) || + (Aggressive && + allMulUsesCanBeContracted(*FMulMI, MRI, AllowFusionGlobally))) && isContractableFMul(*FMulMI, AllowFusionGlobally)) { MatchInfo = [=, &MI](MachineIRBuilder &B) { B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, @@ -6662,7 +6780,9 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA( // fold (fsub (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), (fneg z)) if (mi_match(LHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(LHSReg))) { + (MRI.hasOneNonDBGUse(LHSReg) || + (Aggressive && + allMulUsesCanBeContracted(*FMulMI, MRI, AllowFusionGlobally)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { Register FpExtX = B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0); @@ -6678,7 +6798,9 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA( // fold (fsub x, (fpext (fmul y, z))) -> (fma (fneg (fpext y)), (fpext z), x) if (mi_match(RHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(RHSReg))) { + (MRI.hasOneNonDBGUse(RHSReg) || + (Aggressive && + allMulUsesCanBeContracted(*FMulMI, MRI, AllowFusionGlobally)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { Register FpExtY = B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0); @@ -6726,6 +6848,7 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA( if ((mi_match(LHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) || mi_match(LHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && 
+ allMulUsesCanBeContracted(*FMulMI, MRI, AllowFusionGlobally) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy, MRI.getType(FMulMI->getOperand(0).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { @@ -6742,6 +6865,7 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA( if ((mi_match(RHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) || mi_match(RHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && + allMulUsesCanBeContracted(*FMulMI, MRI, AllowFusionGlobally) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy, MRI.getType(FMulMI->getOperand(0).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 07cbedbcb201e..e5b5b79859a0e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17217,6 +17217,131 @@ static bool isContractableFMUL(const TargetOptions &Options, SDValue N) { N->getFlags().hasAllowContract(); } +// Check if a node is a fused FMA or FMAD operation. +template +static bool isFusedOp(const MatcherClass &matcher, SDValue N) { + return matcher.match(N, ISD::FMA) || matcher.match(N, ISD::FMAD); +} + +// Forward declaration for mutual recursion with allMulUsesCanBeContracted. +template +static bool isContractableUser(SDValue Val, bool AllowFusionGlobally, + const MatcherClass &matcher, + const TargetLowering &TLI, SelectionDAG &DAG); + +// Check if all uses of a multiply can be contracted into FMA operations. +// Returns true if all uses of the multiply are contractable, meaning the +// multiply can potentially be eliminated through FMA contraction. +// Returns false if any use cannot be contracted, which would mean contracting +// would duplicate the multiply without reducing the total number of operations. 
+// +// This uses a simple, non-recursive check for the following patterns: +// - fmul → fadd/fsub: Direct contraction +// - fmul → fneg → fsub: FNEG folds into FMA with negated operand +// - fmul → fpext → {fadd, fsub, fma}: FPEXT folds if target supports it +// - fmul → fma: Assume FMA contraction will handle it (to avoid complexity) +template +static bool allMulUsesCanBeContracted(SDValue Mul, bool AllowFusionGlobally, + const MatcherClass &matcher, + const TargetLowering &TLI, + SelectionDAG &DAG) { + // Check if all uses are contractable patterns + for (const auto *User : Mul->users()) { + SDNode *UserNode = const_cast(User); + unsigned Opcode = UserNode->getOpcode(); + + // Direct FADD/FSUB - contractable if fusion is allowed + if (Opcode == ISD::FADD || Opcode == ISD::FSUB) { + if (AllowFusionGlobally || UserNode->getFlags().hasAllowContract()) { + // Make sure both operands aren't already fused ops + if (!isFusedOp(matcher, UserNode->getOperand(0)) || + !isFusedOp(matcher, UserNode->getOperand(1))) + continue; // This use is contractable + } + return false; // Not contractable + } + + // FNEG - check if ALL users are contractable FSUBs (which can fold the + // negation) Also handles FNEG → FPEXT → FSUB + if (Opcode == ISD::FNEG) { + for (const auto *FNegUser : UserNode->users()) { + unsigned FNegUserOp = FNegUser->getOpcode(); + + if (FNegUserOp == ISD::FSUB) { + if (!(AllowFusionGlobally || FNegUser->getFlags().hasAllowContract())) + return false; + } else if (FNegUserOp == ISD::FP_EXTEND) { + // FNEG → FPEXT → FSUB + EVT FNegSrcVT = UserNode->getOperand(0).getValueType(); + EVT FNegDstVT = FNegUser->getValueType(0); + + if (!TLI.isFPExtFoldable(DAG, ISD::FMA, FNegDstVT, FNegSrcVT)) + return false; + + for (const auto *FNegFPExtUser : FNegUser->users()) { + if (FNegFPExtUser->getOpcode() != ISD::FSUB || + !(AllowFusionGlobally || + FNegFPExtUser->getFlags().hasAllowContract())) + return false; + } + } else { + return false; + } + } + continue; // All FNEG 
uses are contractable + } + + // FP_EXTEND - check if ALL users are FADD, FSUB, or FMA (and if target + // supports folding) Also handles FP_EXTEND → FNEG → FSUB + if (Opcode == ISD::FP_EXTEND) { + EVT SrcVT = UserNode->getOperand(0).getValueType(); + EVT DstVT = UserNode->getValueType(0); + + if (!TLI.isFPExtFoldable(DAG, ISD::FMA, DstVT, SrcVT)) + return false; // Target doesn't support folding this FPEXT + + for (const auto *FPExtUser : UserNode->users()) { + unsigned ExtUserOp = FPExtUser->getOpcode(); + bool isContractableUse = false; + + if (ExtUserOp == ISD::FADD || ExtUserOp == ISD::FSUB) { + if (AllowFusionGlobally || FPExtUser->getFlags().hasAllowContract()) + isContractableUse = true; + } else if (ExtUserOp == ISD::FMA || ExtUserOp == ISD::FMAD) { + isContractableUse = true; + } else if (ExtUserOp == ISD::FNEG) { + // FP_EXTEND → FNEG → FSUB + for (const auto *FPExtFNegUser : FPExtUser->users()) { + if (FPExtFNegUser->getOpcode() != ISD::FSUB || + !(AllowFusionGlobally || + FPExtFNegUser->getFlags().hasAllowContract())) { + isContractableUse = false; + break; + } + isContractableUse = true; + } + } + + if (!isContractableUse) + return false; // Found non-contractable user of FPEXT + } + continue; // All FPEXT uses are contractable + } + + // FMA/FMAD - assume a mul being used by a fusedop will get contracted. + // There is a chance we may miss some corner cases where we will still have + // the mul left over, but this keeps the analysis simple and maintains + // existing behavior in the worse case. + if (isFusedOp(matcher, SDValue(UserNode, 0))) + continue; + + // Any other use type is not contractable + return false; + } + + return true; // All uses can be contracted +} + /// Try to perform FMA combining on a given FADD node. template SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { @@ -17263,10 +17388,6 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { unsigned PreferredFusedOpcode = HasFMAD ? 
ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); - auto isFusedOp = [&](SDValue N) { - return matcher.match(N, ISD::FMA) || matcher.match(N, ISD::FMAD); - }; - // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. auto isContractableFMUL = [AllowFusionGlobally, &matcher](SDValue N) { @@ -17282,14 +17403,24 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { } // fold (fadd (fmul x, y), z) -> (fma x, y, z) - if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { + // Only contract if the multiply has one use or all uses are contractable, + // avoiding duplication of the multiply without reducing total operations. + if (isContractableFMUL(N0) && + (N0->hasOneUse() || + (Aggressive && allMulUsesCanBeContracted(N0, AllowFusionGlobally, + matcher, TLI, DAG)))) { return matcher.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), N1); } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. - if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { + // Only contract if the multiply has one use or all uses are contractable, + // avoiding duplication of the multiply without reducing total operations. 
+ if (isContractableFMUL(N1) && + (N1->hasOneUse() || + (Aggressive && allMulUsesCanBeContracted(N1, AllowFusionGlobally, + matcher, TLI, DAG)))) { return matcher.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), N0); } @@ -17305,16 +17436,16 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { bool CanReassociate = N->getFlags().hasAllowReassociation(); if (CanReassociate) { SDValue FMA, E; - if (isFusedOp(N0) && N0.hasOneUse()) { + if (isFusedOp(matcher, N0) && N0.hasOneUse()) { FMA = N0; E = N1; - } else if (isFusedOp(N1) && N1.hasOneUse()) { + } else if (isFusedOp(matcher, N1) && N1.hasOneUse()) { FMA = N1; E = N0; } SDValue TmpFMA = FMA; - while (E && isFusedOp(TmpFMA) && TmpFMA.hasOneUse()) { + while (E && isFusedOp(matcher, TmpFMA) && TmpFMA.hasOneUse()) { SDValue FMul = TmpFMA->getOperand(2); if (matcher.match(FMul, ISD::FMUL) && FMul.hasOneUse()) { SDValue C = FMul.getOperand(0); @@ -17336,6 +17467,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (matcher.match(N0, ISD::FP_EXTEND)) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && + allMulUsesCanBeContracted(N00, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { return matcher.getNode( @@ -17350,6 +17483,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (matcher.match(N1, ISD::FP_EXTEND)) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && + allMulUsesCanBeContracted(N10, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N10.getValueType())) { return matcher.getNode( @@ -17371,11 +17506,13 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { matcher.getNode(ISD::FP_EXTEND, SL, VT, U), matcher.getNode(ISD::FP_EXTEND, SL, VT, V), Z)); }; - if (isFusedOp(N0)) { + if (isFusedOp(matcher, N0)) { SDValue N02 = N0.getOperand(2); if (matcher.match(N02, ISD::FP_EXTEND)) { SDValue N020 = N02.getOperand(0); if 
(isContractableFMUL(N020) && + allMulUsesCanBeContracted(N020, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N020.getValueType())) { return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), @@ -17402,9 +17539,11 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { }; if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); - if (isFusedOp(N00)) { + if (isFusedOp(matcher, N00)) { SDValue N002 = N00.getOperand(2); if (isContractableFMUL(N002) && + allMulUsesCanBeContracted(N002, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), @@ -17416,11 +17555,13 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // fold (fadd x, (fma y, z, (fpext (fmul u, v))) // -> (fma y, z, (fma (fpext u), (fpext v), x)) - if (isFusedOp(N1)) { + if (isFusedOp(matcher, N1)) { SDValue N12 = N1.getOperand(2); if (N12.getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N12.getOperand(0); if (isContractableFMUL(N120) && + allMulUsesCanBeContracted(N120, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N120.getValueType())) { return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), @@ -17437,9 +17578,11 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // interesting for all targets, especially GPUs. 
if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); - if (isFusedOp(N10)) { + if (isFusedOp(matcher, N10)) { SDValue N102 = N10.getOperand(2); if (isContractableFMUL(N102) && + allMulUsesCanBeContracted(N102, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N10.getValueType())) { return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), @@ -17505,7 +17648,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) { - if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) { + // Only contract if the multiply has one use or all uses are contractable, + // avoiding duplication of the multiply without reducing total operations. + if (isContractableFMUL(XY) && + (XY->hasOneUse() || + (Aggressive && allMulUsesCanBeContracted(XY, AllowFusionGlobally, + matcher, TLI, DAG)))) { return matcher.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0), XY.getOperand(1), matcher.getNode(ISD::FNEG, SL, VT, Z)); @@ -17516,7 +17664,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) { - if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) { + // Only contract if the multiply has one use or all uses are contractable, + // avoiding duplication of the multiply without reducing total operations. 
+ if (isContractableFMUL(YZ) && + (YZ->hasOneUse() || + (Aggressive && allMulUsesCanBeContracted(YZ, AllowFusionGlobally, + matcher, TLI, DAG)))) { return matcher.getNode( PreferredFusedOpcode, SL, VT, matcher.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)), @@ -17546,7 +17699,10 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) if (matcher.match(N0, ISD::FNEG) && isContractableFMUL(N0.getOperand(0)) && - (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { + ((Aggressive && + allMulUsesCanBeContracted(N0.getOperand(0), AllowFusionGlobally, + matcher, TLI, DAG)) || + (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); return matcher.getNode(PreferredFusedOpcode, SL, VT, @@ -17561,6 +17717,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (matcher.match(N0, ISD::FP_EXTEND)) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && + allMulUsesCanBeContracted(N00, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { return matcher.getNode( @@ -17577,6 +17735,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (matcher.match(N1, ISD::FP_EXTEND)) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && + allMulUsesCanBeContracted(N10, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N10.getValueType())) { return matcher.getNode( @@ -17599,6 +17759,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (matcher.match(N00, ISD::FNEG)) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && + allMulUsesCanBeContracted(N000, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { return matcher.getNode( @@ -17623,6 +17785,8 @@ SDValue 
DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (matcher.match(N00, ISD::FP_EXTEND)) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && + allMulUsesCanBeContracted(N000, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N000.getValueType())) { return matcher.getNode( @@ -17640,16 +17804,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { return isContractableFMUL(N) && N->getFlags().hasAllowReassociation(); }; - auto isFusedOp = [&](SDValue N) { - return matcher.match(N, ISD::FMA) || matcher.match(N, ISD::FMAD); - }; - // More folding opportunities when target permits. if (Aggressive && N->getFlags().hasAllowReassociation()) { bool CanFuse = N->getFlags().hasAllowContract(); // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) - if (CanFuse && isFusedOp(N0) && + if (CanFuse && isFusedOp(matcher, N0) && isContractableAndReassociableFMUL(N0.getOperand(2)) && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return matcher.getNode( @@ -17662,7 +17822,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub x, (fma y, z, (fmul u, v))) // -> (fma (fneg y), z, (fma (fneg u), v, x)) - if (CanFuse && isFusedOp(N1) && + if (CanFuse && isFusedOp(matcher, N1) && isContractableAndReassociableFMUL(N1.getOperand(2)) && N1->hasOneUse() && NoSignedZero) { SDValue N20 = N1.getOperand(2).getOperand(0); @@ -17677,11 +17837,13 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) - if (isFusedOp(N0) && N0->hasOneUse()) { + if (isFusedOp(matcher, N0) && N0->hasOneUse()) { SDValue N02 = N0.getOperand(2); if (matcher.match(N02, ISD::FP_EXTEND)) { SDValue N020 = N02.getOperand(0); if (isContractableAndReassociableFMUL(N020) && + allMulUsesCanBeContracted(N020, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 
N020.getValueType())) { return matcher.getNode( @@ -17703,9 +17865,11 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // interesting for all targets, especially GPUs. if (matcher.match(N0, ISD::FP_EXTEND)) { SDValue N00 = N0.getOperand(0); - if (isFusedOp(N00)) { + if (isFusedOp(matcher, N00)) { SDValue N002 = N00.getOperand(2); if (isContractableAndReassociableFMUL(N002) && + allMulUsesCanBeContracted(N002, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { return matcher.getNode( @@ -17723,10 +17887,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) - if (isFusedOp(N1) && matcher.match(N1.getOperand(2), ISD::FP_EXTEND) && - N1->hasOneUse()) { + if (isFusedOp(matcher, N1) && + matcher.match(N1.getOperand(2), ISD::FP_EXTEND) && N1->hasOneUse()) { SDValue N120 = N1.getOperand(2).getOperand(0); if (isContractableAndReassociableFMUL(N120) && + allMulUsesCanBeContracted(N120, AllowFusionGlobally, matcher, TLI, + DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N120.getValueType())) { SDValue N1200 = N120.getOperand(0); @@ -17749,12 +17915,15 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. 
-    if (matcher.match(N1, ISD::FP_EXTEND) && isFusedOp(N1.getOperand(0))) {
+    if (matcher.match(N1, ISD::FP_EXTEND) &&
+        isFusedOp(matcher, N1.getOperand(0))) {
       SDValue CvtSrc = N1.getOperand(0);
       SDValue N100 = CvtSrc.getOperand(0);
       SDValue N101 = CvtSrc.getOperand(1);
       SDValue N102 = CvtSrc.getOperand(2);
       if (isContractableAndReassociableFMUL(N102) &&
+          allMulUsesCanBeContracted(N102, AllowFusionGlobally, matcher, TLI,
+                                    DAG) &&
           TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                               CvtSrc.getValueType())) {
         SDValue N1020 = N102.getOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e08c7f6bb3c49..af164d1c8a4d4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15517,6 +15517,36 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
   return 0;
 }
 
+// Decide whether fusing an FADD into one of its users is worthwhile.
+// A single-use FADD always qualifies. Otherwise every user must be an FADD or
+// FSUB that may be contracted with it, so that the FADD itself goes away.
+static bool shouldContractFAdd(SDValue FAdd, SelectionDAG &DAG) {
+  if (FAdd->hasOneUse())
+    return true;
+
+  // Loop-invariant parts of the per-user contraction test.
+  const bool FuseGlobally =
+      DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast;
+  const bool FAddMayContract = FAdd->getFlags().hasAllowContract();
+
+  for (const SDNode *Use : FAdd->users()) {
+    const unsigned Opc = Use->getOpcode();
+    if (Opc != ISD::FADD && Opc != ISD::FSUB)
+      return false; // Only add/sub users can absorb the FADD.
+
+    // Fusion needs either the global option or the contract flag on both.
+    if (!FuseGlobally &&
+        !(FAddMayContract && Use->getFlags().hasAllowContract()))
+      return false;
+
+    // The FADD has to be one of the two operands being combined.
+    if (Use->getOperand(0) != FAdd && Use->getOperand(1) != FAdd)
+      return false;
+  }
+
+  return true; // Every user can be contracted.
+}
+
 // For a reassociatable opcode perform:
 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
 SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
@@ -16393,7 +16423,9 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
   // fadd (fadd (a, a), b) -> mad 2.0, a, b
   if (LHS.getOpcode() == ISD::FADD) {
     SDValue A = LHS.getOperand(0);
-    if (A == LHS.getOperand(1)) {
+    // Only contract if fadd(a,a) has one use or all uses are contractable,
+    // avoiding duplication of the fadd without reducing total operations.
+    if (A == LHS.getOperand(1) && shouldContractFAdd(LHS, DAG)) {
       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
       if (FusedOp != 0) {
         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
@@ -16405,7 +16437,9 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
   // fadd (b, fadd (a, a)) -> mad 2.0, a, b
   if (RHS.getOpcode() == ISD::FADD) {
     SDValue A = RHS.getOperand(0);
-    if (A == RHS.getOperand(1)) {
+    // Only contract if fadd(a,a) has one use or all uses are contractable,
+    // avoiding duplication of the fadd without reducing total operations.
+ if (A == RHS.getOperand(1) && shouldContractFAdd(RHS, DAG)) { unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); if (FusedOp != 0) { const SDValue Two = DAG.getConstantFP(2.0, SL, VT); @@ -16437,7 +16471,9 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N, if (LHS.getOpcode() == ISD::FADD) { // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) SDValue A = LHS.getOperand(0); - if (A == LHS.getOperand(1)) { + // Only contract if fadd(a,a) has one use or all uses are contractable, + // avoiding duplication of the fadd without reducing total operations. + if (A == LHS.getOperand(1) && shouldContractFAdd(LHS, DAG)) { unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); if (FusedOp != 0) { const SDValue Two = DAG.getConstantFP(2.0, SL, VT); @@ -16452,7 +16488,9 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N, // (fsub c, (fadd a, a)) -> mad -2.0, a, c SDValue A = RHS.getOperand(0); - if (A == RHS.getOperand(1)) { + // Only contract if fadd(a,a) has one use or all uses are contractable, + // avoiding duplication of the fadd without reducing total operations. 
+ if (A == RHS.getOperand(1) && shouldContractFAdd(RHS, DAG)) { unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); if (FusedOp != 0) { const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT); diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index 285dc7cd4ce7c..fdec6886920b0 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -339,12 +339,12 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) { ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x42800000 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; GFX9-NEXT: v_fma_f32 v2, v2, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_exp_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX9-NEXT: v_not_b32_e32 v3, 63 diff --git a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll new file mode 100644 index 0000000000000..864e234cead53 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll @@ -0,0 +1,2385 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx9-generic < %s | FileCheck -check-prefixes=GFX9-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx9-generic < %s | FileCheck -check-prefixes=GFX9-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck 
-check-prefixes=GFX10-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx11-generic < %s | FileCheck -check-prefixes=GFX11-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx11-generic < %s | FileCheck -check-prefixes=GFX11-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx12-generic < %s | FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx12-generic < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +; Test that FMA contraction is prevented when multiply has uses that cannot +; all be contracted, which would result in duplicating the multiply without +; reducing total operation count. + +; Test case: multiply has one non-contractable use (another multiply) +; Should NOT contract the fadd since the multiply would be duplicated +define amdgpu_kernel void @mul_has_noncontractable_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_has_noncontractable_use: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v2, v1, v3 +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_has_noncontractable_use: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mul_f32_e32 v2, v1, v3 +; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_has_noncontractable_use: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v2, v1, v3 +; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX10-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; 
GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_has_noncontractable_use: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-GISEL-NEXT: v_mul_f32_e32 v2, v1, v3 +; GFX10-GISEL-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX10-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_has_noncontractable_use: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v2, v1, v3 +; GFX11-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; 
GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_has_noncontractable_use: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mul_f32_e32 v2, v1, v3 +; GFX11-GISEL-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_has_noncontractable_use: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX12-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_mul_f32_e32 v2, v1, v3 +; GFX12-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX12-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_has_noncontractable_use: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_mul_f32_e32 v2, v1, v3 +; GFX12-GISEL-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX12-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 0 + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 0 + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = 
load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr addrspace(1) %gep.3 + + %mul = fmul contract float %a, %b + %extrause = fmul contract float %mul, %c ; Non-contractable use + %fma1 = fadd contract float %mul, %d ; Would like to contract but can't + store volatile float %extrause, ptr addrspace(1) %gep.out.0 + store volatile float %fma1, ptr addrspace(1) %gep.out.1 + ret void +} + +; Test case: multiply has two contractable fadd uses +; SHOULD contract both fadds into fmas, eliminating the multiply +define amdgpu_kernel void @mul_two_contractable_fadd_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_two_contractable_fadd_uses: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-SDAG-NEXT: v_mac_f32_e32 v4, v1, v2 +; GFX9-SDAG-NEXT: global_store_dword v0, v3, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v4, s[0:1] offset:4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_two_contractable_fadd_uses: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-GISEL-NEXT: v_mac_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v4, s[0:1] offset:4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_two_contractable_fadd_uses: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX10-SDAG-NEXT: v_fmac_f32_e32 v4, v1, v2 +; GFX10-SDAG-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v4, s[0:1] offset:4 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_two_contractable_fadd_uses: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 
glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX10-GISEL-NEXT: v_fmac_f32_e32 v4, v1, v2 +; GFX10-GISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v4, s[0:1] offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_two_contractable_fadd_uses: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX11-SDAG-NEXT: v_fmac_f32_e32 v4, v1, v2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v4, s[0:1] offset:4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_two_contractable_fadd_uses: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX11-GISEL-NEXT: v_fmac_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v4, s[0:1] offset:4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_two_contractable_fadd_uses: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v4, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v4, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_two_contractable_fadd_uses: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; 
GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX12-GISEL-NEXT: v_fmac_f32_e32 v4, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v4, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 0 + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 0 + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr addrspace(1) %gep.3 + + %mul = fmul contract float %a, %b + %fma1 = fadd contract float %mul, %c ; Contractable + %fma2 = fadd contract float %mul, %d ; Contractable + store volatile float %fma1, ptr addrspace(1) %gep.out.0 + store volatile float %fma2, ptr addrspace(1) %gep.out.1 + ret void +} + +; Test case: multiply with constant and two contractable uses +; SHOULD contract both fadds +define amdgpu_kernel void @mul_constant_two_contractable_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_constant_two_contractable_uses: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc +; GFX9-SDAG-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-SDAG-NEXT: v_mac_f32_e32 v3, 2.0, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_constant_two_contractable_uses: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-GISEL-NEXT: v_mac_f32_e32 v3, 2.0, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_constant_two_contractable_uses: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX10-SDAG-NEXT: v_fmac_f32_e32 v3, 2.0, v1 +; GFX10-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_constant_two_contractable_uses: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX10-GISEL-NEXT: v_fmac_f32_e32 v3, 2.0, v1 +; GFX10-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_constant_two_contractable_uses: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX11-SDAG-NEXT: v_fmac_f32_e32 v3, 2.0, v1 +; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_constant_two_contractable_uses: +; GFX11-GISEL: ; %bb.0: +; 
GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX11-GISEL-NEXT: v_fmac_f32_e32 v3, 2.0, v1 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_constant_two_contractable_uses: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v3, 2.0, v1 +; GFX12-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_constant_two_contractable_uses: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; 
GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX12-GISEL-NEXT: v_fmac_f32_e32 v3, 2.0, v1 +; GFX12-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 0 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 0 + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %c = load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr addrspace(1) %gep.3 + + %mul = fmul contract float %a, 2.0 + %fma1 = fadd contract float %mul, %c + %fma2 = fadd contract float %mul, %d + store volatile float %fma1, ptr addrspace(1) %gep.out.0 + store volatile float %fma2, ptr addrspace(1) %gep.out.1 + ret void +} + +; Test case: multiply has two contractable uses AND direct use +; Should NOT contract since multiply is used directly +define amdgpu_kernel void @mul_two_contractable_uses_plus_direct_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_two_contractable_uses_plus_direct_use: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc +; 
GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-SDAG-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX9-SDAG-NEXT: v_add_f32_e32 v3, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_two_contractable_uses_plus_direct_use: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, 2.0, v1 +; GFX9-GISEL-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX9-GISEL-NEXT: v_add_f32_e32 v3, v1, v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_two_contractable_uses_plus_direct_use: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc dlc +; 
GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-SDAG-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX10-SDAG-NEXT: v_add_f32_e32 v3, v1, v3 +; GFX10-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_two_contractable_uses_plus_direct_use: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_mul_f32_e32 v1, 2.0, v1 +; GFX10-GISEL-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX10-GISEL-NEXT: v_add_f32_e32 v3, v1, v3 +; GFX10-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_two_contractable_uses_plus_direct_use: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX11-SDAG-NEXT: v_add_f32_e32 v3, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:8 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_two_contractable_uses_plus_direct_use: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_mul_f32_e32 v1, 2.0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX11-GISEL-NEXT: v_add_f32_e32 v3, v1, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:8 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_two_contractable_uses_plus_direct_use: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], 
s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX12-SDAG-NEXT: v_add_f32_e32 v3, v1, v3 +; GFX12-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_two_contractable_uses_plus_direct_use: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_f32_e32 v1, 2.0, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX12-GISEL-NEXT: v_add_f32_e32 v3, v1, v3 +; GFX12-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: 
global_store_b32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 0 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 0 + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + %gep.out.2 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 2 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %c = load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr addrspace(1) %gep.3 + + %mul = fmul contract float %a, 2.0 + %fma1 = fadd contract float %mul, %c + %fma2 = fadd contract float %mul, %d + store volatile float %fma1, ptr addrspace(1) %gep.out.0 + store volatile float %fma2, ptr addrspace(1) %gep.out.1 + store volatile float %mul, ptr addrspace(1) %gep.out.2 ; Direct use prevents contraction + ret void +} + +; Test case: multiply has two contractable fsub uses +; SHOULD contract both fsubs into fmas +define amdgpu_kernel void @mul_two_contractable_fsub_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_two_contractable_fsub_uses: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mad_f32 v2, v1, 2.0, -v2 +; GFX9-SDAG-NEXT: v_mad_f32 v1, v1, 2.0, -v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_two_contractable_fsub_uses: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mad_f32 v2, v1, 2.0, -v2 +; GFX9-GISEL-NEXT: v_mad_f32 v1, v1, 2.0, -v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_two_contractable_fsub_uses: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_fma_f32 v2, v1, 2.0, -v2 +; GFX10-SDAG-NEXT: v_fma_f32 v1, v1, 2.0, -v3 +; GFX10-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_two_contractable_fsub_uses: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_fma_f32 v2, v1, 2.0, -v2 +; GFX10-GISEL-NEXT: v_fma_f32 v1, v1, 2.0, -v3 +; GFX10-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_two_contractable_fsub_uses: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_fma_f32 v2, v1, 2.0, -v2 +; GFX11-SDAG-NEXT: v_fma_f32 v1, v1, 2.0, -v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_two_contractable_fsub_uses: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, 
s[2:3] offset:12 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_fma_f32 v2, v1, 2.0, -v2 +; GFX11-GISEL-NEXT: v_fma_f32 v1, v1, 2.0, -v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_two_contractable_fsub_uses: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v3, 2.0, v1 +; GFX12-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_two_contractable_fsub_uses: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] 
offset:12 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_fma_f32 v2, v1, 2.0, -v2 +; GFX12-GISEL-NEXT: v_fma_f32 v1, v1, 2.0, -v3 +; GFX12-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 0 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 0 + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %c = load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr addrspace(1) %gep.3 + + %mul = fmul contract float %a, 2.0 + %fma1 = fsub contract float %mul, %c + %fma2 = fsub contract float %mul, %d + store volatile float %fma1, ptr addrspace(1) %gep.out.0 + store volatile float %fma2, ptr addrspace(1) %gep.out.1 + ret void +} + +; Test case: multiply with two contractable fsub uses AND direct use +; Should NOT contract since multiply is used directly +define amdgpu_kernel void @mul_two_contractable_fsub_uses_plus_direct_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_two_contractable_fsub_uses_plus_direct_use: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v1, v1 +; 
GFX9-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_two_contractable_fsub_uses_plus_direct_use: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, 2.0, v1 +; GFX9-GISEL-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v1, v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_two_contractable_fsub_uses_plus_direct_use: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v1, 
v1 +; GFX10-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX10-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 +; GFX10-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_two_contractable_fsub_uses_plus_direct_use: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:8 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:12 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_mul_f32_e32 v1, 2.0, v1 +; GFX10-GISEL-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX10-GISEL-NEXT: v_sub_f32_e32 v3, v1, v3 +; GFX10-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v3, s[0:1] offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_two_contractable_fsub_uses_plus_direct_use: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 glc dlc 
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX11-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:8 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_two_contractable_fsub_uses_plus_direct_use: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_mul_f32_e32 v1, 2.0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX11-GISEL-NEXT: v_sub_f32_e32 v3, v1, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:8 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_two_contractable_fsub_uses_plus_direct_use: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX12-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 +; GFX12-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_two_contractable_fsub_uses_plus_direct_use: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_f32_e32 v1, 2.0, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX12-GISEL-NEXT: v_sub_f32_e32 v3, v1, v3 +; GFX12-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr 
addrspace(1) %in, i32 0 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 0 + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + %gep.out.2 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 2 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %c = load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr addrspace(1) %gep.3 + + %mul = fmul contract float %a, 2.0 + %fma1 = fsub contract float %mul, %c + %fma2 = fsub contract float %mul, %d + store volatile float %fma1, ptr addrspace(1) %gep.out.0 + store volatile float %fma2, ptr addrspace(1) %gep.out.1 + store volatile float %mul, ptr addrspace(1) %gep.out.2 ; Direct use prevents contraction + ret void +} + +; Test case: multiply -> fneg -> fsub pattern (single use) +; SHOULD contract into fma with negated operand +define amdgpu_kernel void @mul_fneg_fsub_single_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_fneg_fsub_single_use: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mad_f32 v1, v1, -v2, -v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_fneg_fsub_single_use: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: 
global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mad_f32 v1, v1, -v2, -v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_fneg_fsub_single_use: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_fma_f32 v1, -v1, v2, -v3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_fneg_fsub_single_use: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_fma_f32 v1, v1, -v2, -v3 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_fneg_fsub_single_use: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], 
s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_fma_f32 v1, -v1, v2, -v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_fneg_fsub_single_use: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_fma_f32 v1, v1, -v2, -v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_fneg_fsub_single_use: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-SDAG-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_fneg_fsub_single_use: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_fma_f32 v1, v1, -v2, -v3 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 0 + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.out = getelementptr float, ptr addrspace(1) %out, i32 0 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = load volatile float, ptr addrspace(1) %gep.2 + + %mul = fmul contract float %a, %b + %neg = fneg contract float %mul + %sub = fsub contract float %neg, %c ; Pattern: fsub(fneg(mul(a,b)), c) -> fma(-a, b, -c) + store volatile float %sub, ptr addrspace(1) %gep.out + ret void +} + +; Test case: multiply -> fneg used by both fsub and fadd +; SHOULD contract because fneg->fadd = fsub then fmul->fsub = fma +define amdgpu_kernel void @mul_fneg_mixed_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_fneg_mixed_uses: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mad_f32 v3, -v1, v2, -v3 +; GFX9-SDAG-NEXT: v_mad_f32 v1, -v1, v2, v4 +; GFX9-SDAG-NEXT: global_store_dword v0, v3, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_fneg_mixed_uses: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mad_f32 v3, v1, -v2, -v3 +; GFX9-GISEL-NEXT: v_mad_f32 v1, v1, -v2, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_fneg_mixed_uses: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: 
global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_fma_f32 v3, -v1, v2, -v3 +; GFX10-SDAG-NEXT: v_fma_f32 v1, -v1, v2, v4 +; GFX10-SDAG-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_fneg_mixed_uses: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_fma_f32 v3, v1, -v2, -v3 +; GFX10-GISEL-NEXT: v_fma_f32 v1, v1, -v2, v4 +; GFX10-GISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_fneg_mixed_uses: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; 
GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_fma_f32 v3, -v1, v2, -v3 +; GFX11-SDAG-NEXT: v_fma_f32 v1, -v1, v2, v4 +; GFX11-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_fneg_mixed_uses: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_fma_f32 v3, v1, -v2, -v3 +; GFX11-GISEL-NEXT: v_fma_f32 v1, v1, -v2, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_fneg_mixed_uses: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v4, v1, v2 +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v4, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_fneg_mixed_uses: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_fma_f32 v3, v1, -v2, -v3 +; GFX12-GISEL-NEXT: v_fma_f32 v1, v1, -v2, v4 +; GFX12-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 0 + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr 
float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 0 + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr addrspace(1) %gep.3 + + %mul = fmul contract float %a, %b + %neg = fneg contract float %mul + %sub = fsub contract float %neg, %c ; Contractable use of fneg + %add = fadd contract float %neg, %d ; Also contracts - fadd(fneg(mul), d) first becomes fsub(d, mul), which then fuses with the mul into an fma/mad + store volatile float %sub, ptr addrspace(1) %gep.out.0 + store volatile float %add, ptr addrspace(1) %gep.out.1 + ret void +} + +; Test case: multiply -> fneg used by both fsub and another multiply +; SHOULD NOT contract since the fneg also feeds an fmul, which cannot be fused +define amdgpu_kernel void @mul_fneg_mixed_uses_2(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_fneg_mixed_uses_2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mul_f32_e64 v1, v1, -v2 +; GFX9-SDAG-NEXT: v_sub_f32_e32 v2, v1, v3 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_fneg_mixed_uses_2: +; GFX9-GISEL: ; %bb.0: +;
GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mul_f32_e64 v1, v1, -v2 +; GFX9-GISEL-NEXT: v_sub_f32_e32 v2, v1, v3 +; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_fneg_mixed_uses_2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_mul_f32_e64 v1, v1, -v2 +; GFX10-SDAG-NEXT: v_sub_f32_e32 v2, v1, v3 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX10-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_fneg_mixed_uses_2: +; GFX10-GISEL: ; %bb.0: +; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_mul_f32_e64 v1, v1, -v2 +; GFX10-GISEL-NEXT: v_sub_f32_e32 v2, v1, v3 +; GFX10-GISEL-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX10-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_fneg_mixed_uses_2: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_mul_f32_e64 v1, v1, -v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_sub_f32_e32 v2, v1, v3 +; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_fneg_mixed_uses_2: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_mul_f32_e64 v1, v1, -v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_sub_f32_e32 v2, v1, v3 +; GFX11-GISEL-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_fneg_mixed_uses_2: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX12-SDAG-NEXT: v_sub_f32_e32 v2, v1, v3 +; 
GFX12-SDAG-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX12-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_fneg_mixed_uses_2: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_f32_e64 v1, v1, -v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_sub_f32_e32 v2, v1, v3 +; GFX12-GISEL-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX12-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 0 + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 0 + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr 
addrspace(1) %gep.3 + + %mul = fmul contract float %a, %b + %neg = fneg contract float %mul + %sub = fsub contract float %neg, %c ; Contractable use of fneg + %mul2 = fmul contract float %neg, %d ; Non-contractable use of fneg (an fmul), so %mul must stay a separate multiply + store volatile float %sub, ptr addrspace(1) %gep.out.0 + store volatile float %mul2, ptr addrspace(1) %gep.out.1 + ret void +} + +; Test case: multiply -> fneg -> multiple fsub uses +; SHOULD contract all fsubs since all fneg uses are contractable +define amdgpu_kernel void @mul_fneg_multiple_fsub_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_fneg_multiple_fsub_uses: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mad_f32 v3, v1, -v2, -v3 +; GFX9-SDAG-NEXT: v_mad_f32 v1, v1, -v2, -v4 +; GFX9-SDAG-NEXT: global_store_dword v0, v3, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_fneg_multiple_fsub_uses: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mad_f32 v3, v1, -v2, -v3 +; GFX9-GISEL-NEXT: v_mad_f32 v1, v1, -v2, -v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_fneg_multiple_fsub_uses: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_fma_f32 v3, -v1, v2, -v3 +; GFX10-SDAG-NEXT: v_fma_f32 v1, -v1, v2, -v4 +; GFX10-SDAG-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_fneg_multiple_fsub_uses: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; 
GFX10-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_fma_f32 v3, v1, -v2, -v3 +; GFX10-GISEL-NEXT: v_fma_f32 v1, v1, -v2, -v4 +; GFX10-GISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_fneg_multiple_fsub_uses: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_fma_f32 v3, -v1, v2, -v3 +; GFX11-SDAG-NEXT: v_fma_f32 v1, -v1, v2, -v4 +; GFX11-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_fneg_multiple_fsub_uses: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v4, v0, 
s[2:3] offset:12 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_fma_f32 v3, v1, -v2, -v3 +; GFX11-GISEL-NEXT: v_fma_f32 v1, v1, -v2, -v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_fneg_multiple_fsub_uses: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v4, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v4, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_fneg_multiple_fsub_uses: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_fma_f32 v3, v1, -v2, -v3 +; GFX12-GISEL-NEXT: v_fma_f32 v1, v1, -v2, -v4 +; GFX12-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 0 + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 0 + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr addrspace(1) %gep.3 + + %mul = fmul contract float %a, %b + %neg = fneg contract float %mul + %sub1 = fsub contract float %neg, %c + %sub2 = fsub contract float %neg, %d + store volatile float %sub1, ptr addrspace(1) %gep.out.0 + store volatile float %sub2, ptr addrspace(1) %gep.out.1 + ret void +} + +; Test case: multiply -> mixed fadd and fsub uses +; SHOULD contract both since both are contractable operations +define amdgpu_kernel void @mul_mixed_fadd_fsub_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_mixed_fadd_fsub_uses: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-SDAG-NEXT: v_mad_f32 v1, v1, v2, -v4 +; GFX9-SDAG-NEXT: global_store_dword v0, v3, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_mixed_fadd_fsub_uses: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-GISEL-NEXT: v_mad_f32 v1, v1, v2, -v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_mixed_fadd_fsub_uses: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) 
+; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX10-SDAG-NEXT: v_fma_f32 v1, v1, v2, -v4 +; GFX10-SDAG-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_mixed_fadd_fsub_uses: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v4, v0, s[2:3] offset:12 glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX10-GISEL-NEXT: v_fma_f32 v1, v1, v2, -v4 +; GFX10-GISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_mixed_fadd_fsub_uses: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, 
s[2:3] offset:4 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX11-SDAG-NEXT: v_fma_f32 v1, v1, v2, -v4 +; GFX11-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_mixed_fadd_fsub_uses: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX11-GISEL-NEXT: v_fma_f32 v1, v1, v2, -v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_mixed_fadd_fsub_uses: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_fmac_f32_e32 v4, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v3, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v4, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_mixed_fadd_fsub_uses: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v4, v0, s[2:3] offset:12 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX12-GISEL-NEXT: v_fma_f32 v1, v1, v2, -v4 +; GFX12-GISEL-NEXT: global_store_b32 v0, v3, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 0 + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = 
getelementptr float, ptr addrspace(1) %out, i32 0 + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr addrspace(1) %gep.3 + + %mul = fmul contract float %a, %b + %add = fadd contract float %mul, %c + %sub = fsub contract float %mul, %d + store volatile float %add, ptr addrspace(1) %gep.out.0 + store volatile float %sub, ptr addrspace(1) %gep.out.1 + ret void +} + +; Test case: multiply used by FNEG then FSUB - should contract +define amdgpu_kernel void @mul_fneg_fsub_contractable(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_fneg_fsub_contractable: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-SDAG-NEXT: s_load_dword s6, s[2:3], 0x8 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-SDAG-NEXT: v_mad_f32 v1, s4, -v1, -v2 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_fneg_fsub_contractable: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_load_dword s6, s[2:3], 0x8 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-GISEL-NEXT: v_mad_f32 v0, s4, -v0, -v1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_fneg_fsub_contractable: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dword s6, s[2:3], 0x8 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_fma_f32 v0, -s4, s5, -v0 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_fneg_fsub_contractable: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dword s6, s[2:3], 0x8 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_fma_f32 v0, s4, -s5, -v0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_fneg_fsub_contractable: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_fma_f32 v0, -s2, s3, -v0 +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_fneg_fsub_contractable: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_fma_f32 v0, s2, -s3, -v0 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_fneg_fsub_contractable: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_xor_b32 s2, s6, 0x80000000 +; GFX12-SDAG-NEXT: s_xor_b32 s3, s4, 0x80000000 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-SDAG-NEXT: s_fmac_f32 s2, s3, s5 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_fneg_fsub_contractable: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_xor_b32 s2, s5, 0x80000000 +; GFX12-GISEL-NEXT: s_xor_b32 s3, s6, 0x80000000 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: s_fmac_f32 s3, s4, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_endpgm + %a = load float, ptr addrspace(1) %in + %gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1 + %b = load float, ptr addrspace(1) %gep.1 + %gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2 + %c = load float, ptr addrspace(1) %gep.2 + + %mul = fmul contract float %a, %b + %neg = fneg float %mul + %sub = fsub contract float %neg, %c + + store float %sub, ptr addrspace(1) %out + ret void +} + +; Test case: multiply used by FNEG then non-SUB - should NOT 
contract +define amdgpu_kernel void @mul_fneg_nonfsub_noncontractable(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; +; +; GFX9-SDAG-LABEL: mul_fneg_nonfsub_noncontractable: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0 +; GFX9-SDAG-NEXT: v_add_f32_e32 v0, s6, v1 +; GFX9-SDAG-NEXT: v_mul_f32_e64 v1, -v1, s7 +; GFX9-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: mul_fneg_nonfsub_noncontractable: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, s6, v1 +; GFX9-GISEL-NEXT: v_mul_f32_e64 v1, -v1, s7 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: mul_fneg_nonfsub_noncontractable: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mul_f32_e64 v1, s4, s5 +; GFX10-SDAG-NEXT: v_add_f32_e32 v0, s6, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e64 v1, -v1, s7 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: mul_fneg_nonfsub_noncontractable: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mul_f32_e64 v1, s4, s5 +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, s6, v1 +; GFX10-GISEL-NEXT: v_mul_f32_e64 v1, -v1, s7 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: mul_fneg_nonfsub_noncontractable: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mul_f32_e64 v1, s4, s5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_f32_e32 v0, s6, v1 +; GFX11-SDAG-NEXT: v_mul_f32_e64 v1, -v1, s7 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: mul_fneg_nonfsub_noncontractable: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mul_f32_e64 v1, s4, s5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f32_e32 v0, s6, v1 +; GFX11-GISEL-NEXT: v_mul_f32_e64 v1, -v1, s7 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: mul_fneg_nonfsub_noncontractable: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_mul_f32 s2, s4, s5 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_3) | instid1(SALU_CYCLE_2) +; GFX12-SDAG-NEXT: s_xor_b32 s3, 
s2, 0x80000000 +; GFX12-SDAG-NEXT: s_add_f32 s2, s2, s6 +; GFX12-SDAG-NEXT: s_mul_f32 s3, s3, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mul_fneg_nonfsub_noncontractable: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mul_f32 s2, s4, s5 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: s_xor_b32 s3, s2, 0x80000000 +; GFX12-GISEL-NEXT: s_add_f32 s2, s2, s6 +; GFX12-GISEL-NEXT: s_mul_f32 s3, s3, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm + %a = load float, ptr addrspace(1) %in + %gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1 + %b = load float, ptr addrspace(1) %gep.1 + %gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2 + %c = load float, ptr addrspace(1) %gep.2 + %gep.3 = getelementptr float, ptr addrspace(1) %in, i32 3 + %d = load float, ptr addrspace(1) %gep.3 + + %mul = fmul contract float %a, %b + %neg = fneg float %mul + %add = fadd contract float %mul, %c ; Direct use - contractable + %other = fmul contract float %neg, %d ; FNEG->FMUL is not contractable + + store float %add, ptr addrspace(1) %out + %gep.out.1 = getelementptr float, ptr addrspace(1) %out, i32 1 + store float %other, ptr addrspace(1) %gep.out.1 + ret void +} + +declare float @llvm.fma.f32(float, float, float) #1 +declare double @llvm.fma.f64(double, double, double) #1 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind readnone } diff --git 
a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index bbd493f668847..137c0731aeecb 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -367,11 +367,9 @@ define i32 @test_D139469_f16(half %arg) { ; GFX9-SDAG-LABEL: test_D139469_f16: ; GFX9-SDAG: ; %bb.0: ; %bb ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x291e -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x211e -; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, 0x291e, v0 -; GFX9-SDAG-NEXT: v_fma_f16 v0, v0, s4, v2 -; GFX9-SDAG-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, 0x291e, v0 +; GFX9-SDAG-NEXT: v_add_f16_e32 v1, 0x211e, v0 +; GFX9-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-SDAG-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -379,11 +377,9 @@ define i32 @test_D139469_f16(half %arg) { ; GFX9-GISEL-LABEL: test_D139469_f16: ; GFX9-GISEL: ; %bb.0: ; %bb ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x291e -; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x211e -; GFX9-GISEL-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, 0x291e, v0 +; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0 +; GFX9-GISEL-NEXT: v_add_f16_e32 v0, 0x211e, v0 ; GFX9-GISEL-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0 ; GFX9-GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] @@ -392,10 +388,9 @@ define i32 @test_D139469_f16(half %arg) { ; GFX10-SDAG-LABEL: test_D139469_f16: ; GFX10-SDAG: ; %bb.0: ; %bb ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x291e -; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, 0x291e, v0 -; GFX10-SDAG-NEXT: v_fmaak_f16 v0, s4, v0, 0x211e -; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX10-SDAG-NEXT: 
v_mul_f16_e32 v0, 0x291e, v0 +; GFX10-SDAG-NEXT: v_add_f16_e32 v1, 0x211e, v0 +; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX10-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -403,10 +398,9 @@ define i32 @test_D139469_f16(half %arg) { ; GFX10-GISEL-LABEL: test_D139469_f16: ; GFX10-GISEL: ; %bb.0: ; %bb ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX10-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX10-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 +; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, 0x291e, v0 +; GFX10-GISEL-NEXT: v_add_f16_e32 v1, 0x211e, v0 +; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_cmp_gt_f16_e64 s4, 0, v1 ; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -415,11 +409,10 @@ define i32 @test_D139469_f16(half %arg) { ; GFX11-SDAG-TRUE16-LABEL: test_D139469_f16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e -; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x291e, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.h, 0x211e, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -428,11 +421,10 @@ define i32 
@test_D139469_f16(half %arg) { ; GFX11-SDAG-FAKE16-LABEL: test_D139469_f16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v2, v1 +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v0, 0x291e, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v1, 0x211e, v0 +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 ; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -441,28 +433,26 @@ define i32 @test_D139469_f16(half %arg) { ; GFX11-GISEL-TRUE16-LABEL: test_D139469_f16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e -; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l -; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x291e, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.h, 0x211e, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l ; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: test_D139469_f16: ; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX11-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_f16_e32 v0, 0x291e, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 0x211e, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 ; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -473,11 +463,10 @@ define i32 @test_D139469_f16(half %arg) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e -; GFX12-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_min_num_f16_e32 v0.l, v1.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x291e, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.h, 0x211e, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_min_num_f16_e32 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) @@ -491,11 +480,10 @@ define i32 @test_D139469_f16(half %arg) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX12-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v2, v1 +; GFX12-SDAG-FAKE16-NEXT: v_mul_f16_e32 v0, 0x291e, v0 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_add_f16_e32 v1, 0x211e, v0 +; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) @@ -509,12 +497,10 @@ define i32 @test_D139469_f16(half %arg) { ; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e -; GFX12-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l -; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l -; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l -; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x291e, v0.l +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX12-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.h, 0x211e, v0.l +; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l ; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h ; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -528,12 +514,10 @@ define i32 @test_D139469_f16(half %arg) { ; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX12-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-FAKE16-NEXT: v_mul_f16_e32 v0, 0x291e, v0 +; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 0x211e, v0 +; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 ; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 ; GFX12-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -554,10 +538,10 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX9-SDAG: ; %bb.0: ; %bb ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x291e -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x211e -; GFX9-SDAG-NEXT: v_pk_mul_f16 v1, v0, s4 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v0, s4, v2 op_sel_hi:[1,0,0] -; GFX9-SDAG-NEXT: v_pk_min_f16 v1, v1, v0 +; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x211e +; GFX9-SDAG-NEXT: v_pk_add_f16 v1, v0, s4 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_min_f16 v1, v0, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-SDAG-NEXT: 
v_cmp_gt_f16_e32 vcc, 0, v1 ; GFX9-SDAG-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, v2 src0_sel:WORD_1 src1_sel:DWORD @@ -569,14 +553,14 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX9-GISEL: ; %bb.0: ; %bb ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x291e291e -; GFX9-GISEL-NEXT: v_pk_mul_f16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2 -; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[6:7], v2, v3 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x211e211e -; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v0, v1, v2 +; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0 +; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[6:7], v0, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX9-GISEL-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0 -; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[8:9], v0, v3 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[8:9], v0, v1 src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] @@ -586,11 +570,10 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX10-SDAG-LABEL: test_D139469_v2f16: ; GFX10-SDAG: ; %bb.0: ; %bb ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x211e -; GFX10-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] -; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s4 op_sel_hi:[0,1,0] +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, 0x291e, v0 op_sel_hi:[0,1] ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: v_pk_min_f16 v1, v1, v0 +; GFX10-SDAG-NEXT: v_pk_add_f16 v1, 0x211e, v0 op_sel_hi:[0,1] +; GFX10-SDAG-NEXT: v_pk_min_f16 v1, v0, v1 ; GFX10-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 ; GFX10-SDAG-NEXT: v_cmp_lt_f16_sdwa s4, v1, 
v2 src0_sel:WORD_1 src1_sel:DWORD ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -600,14 +583,13 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX10-GISEL-LABEL: test_D139469_v2f16: ; GFX10-GISEL: ; %bb.0: ; %bb ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX10-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 -; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX10-GISEL-NEXT: v_cmp_gt_f16_e64 s4, 0, v0 -; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s5, v2, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s6, v0, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, 0x291e291e, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: v_pk_add_f16 v1, 0x211e211e, v0 +; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s5, v0, v2 src0_sel:WORD_1 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_cmp_gt_f16_e64 s4, 0, v1 +; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s6, v1, v2 src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10-GISEL-NEXT: s_or_b32 s4, s5, s6 @@ -617,14 +599,13 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX11-SDAG-TRUE16-LABEL: test_D139469_v2f16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e -; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] -; GFX11-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v0, 0x291e, v0 op_sel_hi:[0,1] ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v1, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: v_pk_add_f16 v1, 0x211e, 
v0 op_sel_hi:[0,1] +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v1, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.h ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -632,15 +613,14 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX11-SDAG-FAKE16-LABEL: test_D139469_v2f16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e -; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] -; GFX11-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v0, 0x291e, v0 op_sel_hi:[0,1] ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_pk_add_f16 v1, 0x211e, v0 op_sel_hi:[0,1] +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 ; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 ; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -648,16 +628,14 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX11-GISEL-TRUE16-LABEL: test_D139469_v2f16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 -; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2.h -; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v0, 0x291e291e, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-TRUE16-NEXT: v_pk_add_f16 v1, 0x211e211e, v0 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v0.h +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1.l ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v0.h +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v1.h ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2 @@ -668,22 +646,20 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX11-GISEL-FAKE16-LABEL: test_D139469_v2f16: ; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX11-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 -; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 -; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: v_pk_mul_f16 v0, 0x291e291e, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: v_pk_add_f16 v1, 0x211e211e, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2 ; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -694,16 +670,14 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e -; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_f16 v0, 0x291e, v0 op_sel_hi:[0,1] ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_pk_min_num_f16 v1, v1, v0 +; GFX12-SDAG-TRUE16-NEXT: v_pk_add_f16 v1, 0x211e, v0 op_sel_hi:[0,1] +; GFX12-SDAG-TRUE16-NEXT: v_pk_min_num_f16 v1, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.h ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -716,17 +690,15 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e -; GFX12-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] -; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX12-SDAG-FAKE16-NEXT: v_pk_mul_f16 v0, 0x291e, v0 op_sel_hi:[0,1] ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v0, v1, v0 +; GFX12-SDAG-FAKE16-NEXT: v_pk_add_f16 v1, 0x211e, v0 op_sel_hi:[0,1] +; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 
0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -739,16 +711,14 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX12-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 -; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 -; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l -; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2.h -; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l +; GFX12-GISEL-TRUE16-NEXT: v_pk_mul_f16 v0, 0x291e291e, v0 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-GISEL-TRUE16-NEXT: v_pk_add_f16 v1, 0x211e211e, v0 +; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l +; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v0.h +; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1.l ; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v0.h +; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v1.h ; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -764,17 +734,15 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX12-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 -; GFX12-GISEL-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 -; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 -; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 -; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-GISEL-FAKE16-NEXT: v_pk_mul_f16 v0, 0x291e291e, v0 +; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-GISEL-FAKE16-NEXT: v_pk_add_f16 v1, 0x211e211e, v0 +; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 +; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2 ; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 ; GFX12-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index c0f3726a5c192..b7de172495f42 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -83,25 +83,20 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; VI-LABEL: multiple_use_fadd_fmac_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: s_load_dword s4, s[8:9], 0x8 -; VI-NEXT: s_load_dword s3, s[8:9], 0x2c +; VI-NEXT: s_load_dword s2, s[8:9], 0x2c +; VI-NEXT: s_load_dword s3, s[8:9], 0x8 ; VI-NEXT: 
s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_add_u32 s2, s0, 4 -; VI-NEXT: v_add_f32_e64 v2, s4, s4 +; VI-NEXT: v_mac_f32_e64 v2, s3, 2.0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v3 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: multiple_use_fadd_fmac_f32: @@ -112,11 +107,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f32_e64 v1, s2, s2 -; GFX10-NEXT: v_fma_f32 v2, s2, 2.0, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[0:1] offset:4 +; GFX10-NEXT: v_fma_f32 v1, s2, 2.0, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; @@ -128,17 +120,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v1, s2, s2 -; GFX11-NEXT: v_fma_f32 v2, s2, 2.0, s3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc +; GFX11-NEXT: v_fma_f32 v1, s2, 2.0, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %mul2 = fmul fast float %x, 2.0 %mad = fadd fast float %mul2, %y - store volatile float %mul2, ptr addrspace(1) %out + ; store volatile float %mul2, ptr addrspace(1) %out store volatile float %mad, ptr addrspace(1) %out.gep.1 ret void } @@ -151,19 +140,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mad_f32 v2, |s2|, 2.0, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s4, s0, 4 -; VI-NEXT: v_add_f32_e64 v2, |s2|, |s2| ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mad_f32 v3, |s2|, 2.0, v3 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_store_dword v[0:1], v3 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: multiple_use_fadd_fmad_f32: @@ -171,11 +155,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f32_e64 v1, |s2|, |s2| -; GFX10-NEXT: v_fma_f32 v2, |s2|, 2.0, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[0:1] offset:4 +; GFX10-NEXT: v_fma_f32 v1, |s2|, 2.0, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; @@ -184,18 +165,15 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 
; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v1, |s2|, |s2| -; GFX11-NEXT: v_fma_f32 v2, |s2|, 2.0, s3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc +; GFX11-NEXT: v_fma_f32 v1, |s2|, 2.0, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %x.abs = call float @llvm.fabs.f32(float %x) %mul2 = fmul fast float %x.abs, 2.0 %mad = fadd fast float %mul2, %y - store volatile float %mul2, ptr addrspace(1) %out + ; store volatile float %mul2, ptr addrspace(1) %out store volatile float %mad, ptr addrspace(1) %out.gep.1 ret void } @@ -549,50 +527,40 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8 +; VI-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-DENORM-NEXT: s_add_i32 s12, s12, s17 ; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; VI-DENORM-NEXT: s_add_u32 s0, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, s4, 2.0, v0 +; VI-DENORM-NEXT: s_addc_u32 s1, s1, 0 +; VI-DENORM-NEXT: v_fma_f16 v2, s2, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4 -; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; 
VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 -; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 -; VI-DENORM-NEXT: flat_store_short v[0:1], v3 -; VI-DENORM-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-NEXT: s_endpgm ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8 +; VI-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17 ; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 +; VI-FLUSH-NEXT: s_add_u32 s0, s0, 2 +; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s3 +; VI-FLUSH-NEXT: s_addc_u32 s1, s1, 0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4 -; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 +; VI-FLUSH-NEXT: v_mac_f16_e64 v2, s2, 2.0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s3 -; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 -; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0 -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 -; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 -; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: multiple_use_fadd_fmac_f16: @@ -603,11 +571,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 -; GFX10-DENORM-NEXT: v_add_f16_e64 v1, s2, s2 -; GFX10-DENORM-NEXT: v_fma_f16 v2, s2, 2.0, s3 -; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1] -; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1] offset:2 +; GFX10-DENORM-NEXT: v_fma_f16 v1, s2, 2.0, s3 +; GFX10-DENORM-NEXT: 
global_store_short v0, v1, s[0:1] offset:2 ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: s_endpgm ; @@ -620,10 +585,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 ; GFX10-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 -; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[0:1] -; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: global_store_short v1, v2, s[0:1] offset:2 +; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s2, v0 +; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[0:1] offset:2 ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLUSH-NEXT: s_endpgm ; @@ -635,11 +598,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-DENORM-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-DENORM-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, s2 -; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.h, s2, 2.0, s3 -; GFX11-DENORM-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc -; GFX11-DENORM-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] offset:2 dlc +; GFX11-DENORM-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.l, s2, 2.0, s3 +; GFX11-DENORM-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc ; GFX11-DENORM-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-TRUE16-NEXT: s_endpgm ; @@ -651,11 +612,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-FAKE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-DENORM-FAKE16-NEXT: v_add_f16_e64 v1, s2, s2 -; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v2, s2, 2.0, s3 -; GFX11-DENORM-FAKE16-NEXT: 
global_store_b16 v0, v1, s[0:1] dlc -; GFX11-DENORM-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc +; GFX11-DENORM-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v1, s2, 2.0, s3 +; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:2 dlc ; GFX11-DENORM-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-FAKE16-NEXT: s_endpgm ; @@ -669,10 +628,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, s2 ; GFX11-FLUSH-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.h, s2, v0.l -; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc -; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] offset:2 dlc +; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, s2, v0.l +; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc ; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-TRUE16-NEXT: s_endpgm ; @@ -686,10 +643,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e64 v0, s2, s2 ; GFX11-FLUSH-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v2, s2, v0 -; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc -; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v2, s[0:1] offset:2 dlc +; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v0, s2, v0 +; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc ; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-FAKE16-NEXT: s_endpgm %x = bitcast i16 %x.arg to half @@ 
-697,7 +652,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1 %mul2 = fmul fast half %x, 2.0 %mad = fadd fast half %mul2, %y - store volatile half %mul2, ptr addrspace(1) %out + ; store volatile half %mul2, ptr addrspace(1) %out store volatile half %mad, ptr addrspace(1) %out.gep.1 ret void } @@ -705,50 +660,40 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8 +; VI-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-DENORM-NEXT: s_add_i32 s12, s12, s17 ; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; VI-DENORM-NEXT: s_add_u32 s0, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, |s4|, 2.0, v0 +; VI-DENORM-NEXT: s_addc_u32 s1, s1, 0 +; VI-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4| -; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 -; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 -; VI-DENORM-NEXT: flat_store_short v[0:1], v3 -; VI-DENORM-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-NEXT: s_endpgm ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8 +; VI-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], 
s[8:9], 0x0 ; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17 ; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 +; VI-FLUSH-NEXT: s_add_u32 s0, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 -; VI-FLUSH-NEXT: v_mad_f16 v3, |s4|, 2.0, v0 +; VI-FLUSH-NEXT: s_addc_u32 s1, s1, 0 +; VI-FLUSH-NEXT: v_mad_f16 v2, |s2|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4| -; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 -; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 -; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: multiple_use_fadd_fmad_f16: @@ -759,11 +704,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 -; GFX10-DENORM-NEXT: v_add_f16_e64 v1, |s2|, |s2| -; GFX10-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, s3 -; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1] -; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1] offset:2 +; GFX10-DENORM-NEXT: v_fma_f16 v1, |s2|, 2.0, s3 +; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1] offset:2 ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: s_endpgm ; @@ -776,10 +718,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| ; GFX10-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 -; GFX10-FLUSH-NEXT: 
global_store_short v1, v0, s[0:1] -; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: global_store_short v1, v2, s[0:1] offset:2 +; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s2, v0 +; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[0:1] offset:2 ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLUSH-NEXT: s_endpgm ; @@ -791,11 +731,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-DENORM-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-DENORM-TRUE16-NEXT: v_add_f16_e64 v0.l, |s2|, |s2| -; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.h, |s2|, 2.0, s3 -; GFX11-DENORM-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc -; GFX11-DENORM-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] offset:2 dlc +; GFX11-DENORM-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.l, |s2|, 2.0, s3 +; GFX11-DENORM-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc ; GFX11-DENORM-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-TRUE16-NEXT: s_endpgm ; @@ -807,11 +745,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-FAKE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-DENORM-FAKE16-NEXT: v_add_f16_e64 v1, |s2|, |s2| -; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v2, |s2|, 2.0, s3 -; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc -; GFX11-DENORM-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc +; GFX11-DENORM-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v1, |s2|, 2.0, s3 +; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:2 dlc ; GFX11-DENORM-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-DENORM-FAKE16-NEXT: s_endpgm ; @@ -825,10 +761,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e64 v0.l, |s2|, |s2| ; GFX11-FLUSH-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.h, s2, v0.l -; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc -; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] offset:2 dlc +; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, s2, v0.l +; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc ; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-TRUE16-NEXT: s_endpgm ; @@ -842,10 +776,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e64 v0, |s2|, |s2| ; GFX11-FLUSH-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v2, s2, v0 -; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc -; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v2, s[0:1] offset:2 dlc +; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v0, s2, v0 +; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc ; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-FAKE16-NEXT: s_endpgm %x = bitcast i16 %x.arg to half @@ -854,7 +786,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 %x.abs = call half @llvm.fabs.f16(half %x) %mul2 = fmul fast half %x.abs, 2.0 %mad = fadd fast half %mul2, %y - store volatile half %mul2, ptr addrspace(1) %out + ; store volatile half %mul2, ptr addrspace(1) %out store volatile half %mad, ptr addrspace(1) %out.gep.1 ret void } diff --git 
a/llvm/test/CodeGen/AMDGPU/fpext-free.ll b/llvm/test/CodeGen/AMDGPU/fpext-free.ll index b88cb210c91e8..8402905752b9d 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext-free.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext-free.ll @@ -28,19 +28,13 @@ define float @fadd_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 { ; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32: -; GFX9-F32FLUSH: ; %bb.0: ; %entry -; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-F32DENORM-LABEL: fadd_fpext_fmul_f16_to_f32: -; GFX9-F32DENORM: ; %bb.0: ; %entry -; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: fadd_fpext_fmul_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %mul.ext = fpext half %mul to float @@ -132,19 +126,13 @@ define float @fadd_fpext_fmul_f16_to_f32_commute(half %x, half %y, float %z) #0 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32_commute: -; GFX9-F32FLUSH: ; %bb.0: ; %entry -; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-F32DENORM-LABEL: fadd_fpext_fmul_f16_to_f32_commute: -; GFX9-F32DENORM: ; %bb.0: ; %entry -; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-F32DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: fadd_fpext_fmul_f16_to_f32_commute: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %mul.ext = fpext half %mul to float @@ -176,9 +164,9 @@ define float @fadd_muladd_fpext_fmul_f16_to_f32(float %x, float %y, half %u, hal ; GFX9-F32FLUSH-LABEL: fadd_muladd_fpext_fmul_f16_to_f32: ; GFX9-F32FLUSH: ; %bb.0: ; %entry ; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-F32FLUSH-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX9-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32DENORM-LABEL: fadd_muladd_fpext_fmul_f16_to_f32: @@ -221,9 +209,9 @@ define float @fadd_muladd_fpext_fmul_f16_to_f32_commute(float %x, float %y, half ; GFX9-F32FLUSH-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute: ; GFX9-F32FLUSH: ; %bb.0: ; %entry ; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-F32FLUSH-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX9-F32FLUSH-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32DENORM-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute: @@ -264,9 +252,9 @@ define float 
@fadd_fmad_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half ; GFX9-F32FLUSH-LABEL: fadd_fmad_fpext_fmul_f16_to_f32: ; GFX9-F32FLUSH: ; %bb.0: ; %entry ; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-F32FLUSH-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX9-F32FLUSH-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32DENORM-LABEL: fadd_fmad_fpext_fmul_f16_to_f32: @@ -307,22 +295,14 @@ define float @fadd_fma_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half % ; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-F32FLUSH-LABEL: fadd_fma_fpext_fmul_f16_to_f32: -; GFX9-F32FLUSH: ; %bb.0: ; %entry -; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-F32DENORM-LABEL: fadd_fma_fpext_fmul_f16_to_f32: -; GFX9-F32DENORM: ; %bb.0: ; %entry -; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: fadd_fma_fpext_fmul_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX89-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul contract half %u, %v %mul.ext = fpext half %mul 
to float @@ -350,22 +330,14 @@ define float @fadd_fma_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u ; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-F32FLUSH-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute: -; GFX9-F32FLUSH: ; %bb.0: ; %entry -; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-F32DENORM-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute: -; GFX9-F32DENORM: ; %bb.0: ; %entry -; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v4, v0 -; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX89-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul contract half %u, %v %mul.ext = fpext half %mul to float @@ -535,19 +507,13 @@ define float @fsub_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 { ; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32: -; GFX9-F32FLUSH: ; %bb.0: ; %entry -; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32: -; GFX9-F32DENORM: ; %bb.0: ; %entry -; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: fsub_fpext_fmul_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %mul.ext = fpext half %mul to float @@ -582,19 +548,13 @@ define float @fsub_fpext_fmul_f16_to_f32_commute(float %x, half %y, half %z) #0 ; GFX11-F32DENORM-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-F32DENORM-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32_commute: -; GFX9-F32FLUSH: ; %bb.0: ; %entry -; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32_commute: -; GFX9-F32DENORM: ; %bb.0: ; %entry -; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v1, v1, v2 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: fsub_fpext_fmul_f16_to_f32_commute: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX89-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul contract half %y, %z %mul.ext = fpext half %mul to float @@ -623,19 +583,13 @@ define float @fsub_fpext_fneg_fmul_f16_to_f32(half %x, half %y, float %z) #0 { ; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-F32FLUSH-LABEL: 
fsub_fpext_fneg_fmul_f16_to_f32: -; GFX9-F32FLUSH: ; %bb.0: ; %entry -; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-F32DENORM-LABEL: fsub_fpext_fneg_fmul_f16_to_f32: -; GFX9-F32DENORM: ; %bb.0: ; %entry -; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32DENORM-NEXT: v_mul_f16_e64 v0, v0, -v1 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: fsub_fpext_fneg_fmul_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e64 v0, v0, -v1 +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %neg.mul = fsub half -0.0, %mul @@ -665,19 +619,13 @@ define float @fsub_fneg_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 { ; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-F32FLUSH-LABEL: fsub_fneg_fpext_fmul_f16_to_f32: -; GFX9-F32FLUSH: ; %bb.0: ; %entry -; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-F32DENORM-LABEL: fsub_fneg_fpext_fmul_f16_to_f32: -; GFX9-F32DENORM: ; %bb.0: ; %entry -; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32DENORM-NEXT: v_mul_f16_e64 v0, v0, -v1 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: fsub_fneg_fpext_fmul_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e64 v0, v0, -v1 +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; 
GFX89-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %mul.ext = fpext half %mul to float @@ -710,9 +658,9 @@ define float @fsub_muladd_fpext_mul_f16_to_f32(float %x, float %y, float %z, hal ; GFX9-F32FLUSH-LABEL: fsub_muladd_fpext_mul_f16_to_f32: ; GFX9-F32FLUSH: ; %bb.0: ; %entry ; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v3, v4, -v2 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-F32FLUSH-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1] +; GFX9-F32FLUSH-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32DENORM-LABEL: fsub_muladd_fpext_mul_f16_to_f32: @@ -797,8 +745,9 @@ define float @fsub_muladd_fpext_mul_f16_to_f32_commute(float %x, float %y, float ; GFX9-F32FLUSH-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute: ; GFX9-F32FLUSH: ; %bb.0: ; %entry ; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v3, v4, v0 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mad_f32 v0, -v1, v2, v0 +; GFX9-F32FLUSH-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1] +; GFX9-F32FLUSH-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32DENORM-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute: