From 3fce7cf8a10fa40418952632d3e9454a18386e7b Mon Sep 17 00:00:00 2001
From: lalaniket8
Date: Fri, 5 Dec 2025 16:09:36 +0530
Subject: [PATCH 1/2] FC for nonSSA Exec Mask Man instrs

---
 .../llvm/CodeGen/MachineRegisterInfo.h        |   7 +
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      |  14 ++
 .../lib/Target/AMDGPU/AMDGPUWaveTransform.cpp |  47 ++---
 llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp   | 182 +++++++++++-------
 llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h     |  21 +-
 5 files changed, 172 insertions(+), 99 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 737b74ef3f761..bb44bb75f3801 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -622,6 +622,13 @@ class MachineRegisterInfo {
   /// multiple definitions or no definition, return null.
   LLVM_ABI MachineInstr *getUniqueVRegDef(Register Reg) const;
 
+  /// getDomVRegDefInBasicBlock - Return an iterator to the last machine instr
+  /// that defines the specified virtual register in basic block \p MBB,
+  /// searching backwards from instruction \p I (exclusive). Returns MBB.end()
+  /// if no definition is found. Passing the end() iterator is valid; the
+  /// search then starts at the last instruction of the block.
+  LLVM_ABI MachineBasicBlock::iterator getDomVRegDefInBasicBlock(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
+
   /// clearKillFlags - Iterate over all the uses of the given register and
   /// clear the kill flag from the MachineOperand. This function is used by
   /// optimization passes which extend register lifetimes and need only
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 094315b3903ea..23b94cd14953f 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -422,6 +422,20 @@ MachineInstr *MachineRegisterInfo::getUniqueVRegDef(Register Reg) const {
   return &*I;
 }
 
+/// getDomVRegDefInBasicBlock - Return an iterator to the last machine instr
+/// that defines the specified virtual register in the basic block, searching
+/// backwards from instruction I (exclusive). Returns MBB.end() if none found.
+MachineBasicBlock::iterator MachineRegisterInfo::getDomVRegDefInBasicBlock( + Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { + if(I == MBB.begin()) return MBB.end(); + // Iterate backwards from I (exclusive) to the beginning of the basic block + do { + --I; + if (I->modifiesRegister(Reg, getTargetRegisterInfo())) return I; + } while (I != MBB.begin()); + return MBB.end(); +} + bool MachineRegisterInfo::hasOneNonDBGUse(Register RegNo) const { return hasSingleElement(use_nodbg_operands(RegNo)); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp index 90cb044285bae..8e2896d7c5103 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp @@ -1909,7 +1909,7 @@ void ControlFlowRewriter::rewrite() { Opcode = AMDGPU::S_CBRANCH_SCC1; } else { Register CondReg = Info.OrigCondition; - bool isCondRegSubsetOfExec = LMA.isSubsetOfExec(CondReg, *Node->Block); + bool isCondRegSubsetOfExec = LMA.isSubsetOfExec(CondReg, *Node->Block, Node->Block->end()); LLVM_DEBUG(dbgs() << "isSubsetOfExec(" << printReg(CondReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << Node->Block->name() << ") : " << isCondRegSubsetOfExec << "\n"); if (!isCondRegSubsetOfExec) { @@ -1973,7 +1973,7 @@ void ControlFlowRewriter::rewrite() { // Step 2.1: Add conditions branching to LaneTarget to the Lane mask // Updater. // FIXME: we are creating a register here only to initialize the updater - Updater.init(LMU.createLaneMaskReg()); + Updater.init(); Updater.addReset(*LaneTarget->Block, GCNLaneMaskUpdater::ResetInMiddle); LLVM_DEBUG(dbgs() << "\nMark ResetInMiddle(X): " << LaneTarget->printableName() << '\n'); for (const auto &NodeDivergentPair : LaneTargetInfo.OriginBranch) { @@ -2023,7 +2023,7 @@ void ControlFlowRewriter::rewrite() { } } else { CondReg = LaneOrigin.CondReg; - bool isCondRegSubsetOfExec = LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block); + bool isCondRegSubsetOfExec = LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block, LaneOrigin.Node->Block->getFirstTerminator()); LLVM_DEBUG(dbgs() << "isSubsetOfExec(" << printReg(LaneOrigin.CondReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << LaneOrigin.Node->Block->name() << ") : " << isCondRegSubsetOfExec << "\n"); if (!isCondRegSubsetOfExec) { Register Prev = CondReg; @@ -2120,7 +2120,7 @@ void ControlFlowRewriter::rewrite() { LLVM_DEBUG(dbgs() << "\nRejoin @ " << Secondary->printableName() << '\n'); Secondary->dump(); // FIXME: we are creating a register here only to initialize the updater - Updater.init(LMU.createLaneMaskReg()); + Updater.init(); Updater.addReset(*Secondary->Block, GCNLaneMaskUpdater::ResetInMiddle); LLVM_DEBUG(dbgs() << "\nMark ResetInMiddle(X): " << Secondary->printableName() << '\n'); @@ -2132,32 +2132,32 @@ void ControlFlowRewriter::rewrite() { Register PrimaryExec = PredInfo.PrimarySuccessorExec; LLVM_DEBUG(dbgs() << "Pred:" << Pred->Block->name() << "\nPrimaryExec:" << printReg(PrimaryExec,MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"); - MachineInstr *PrimaryExecDef; - for (;;) { - PrimaryExecDef = MRI.getVRegDef(PrimaryExec); - if (PrimaryExecDef->getOpcode() != AMDGPU::COPY) - break; - PrimaryExec = PrimaryExecDef->getOperand(1).getReg(); - } + // MachineInstr *PrimaryExecDef; + // for (;;) { + // PrimaryExecDef = MRI.getVRegDef(PrimaryExec); + // if (PrimaryExecDef->getOpcode() != AMDGPU::COPY) + // break; + // PrimaryExec = PrimaryExecDef->getOperand(1).getReg(); + // } - 
LLVM_DEBUG(dbgs() << "PrimaryExecDef:"); - LLVM_DEBUG(PrimaryExecDef->dump()); - LLVM_DEBUG(dbgs() << "\n"); + // LLVM_DEBUG(dbgs() << "PrimaryExecDef:"); + // LLVM_DEBUG(PrimaryExecDef->dump()); + // LLVM_DEBUG(dbgs() << "\n"); // Rejoin = EXEC ^ PrimaryExec // // Fold immediately if PrimaryExec was obtained via XOR as well. Register Rejoin; - if (PrimaryExecDef->getParent() == Pred->Block && - PrimaryExecDef->getOpcode() == LMC.XorOpc && - PrimaryExecDef->getOperand(1).isReg() && - PrimaryExecDef->getOperand(2).isReg()) { - if (PrimaryExecDef->getOperand(1).getReg() == LMC.ExecReg) - Rejoin = PrimaryExecDef->getOperand(2).getReg(); - else if (PrimaryExecDef->getOperand(2).getReg() == LMC.ExecReg) - Rejoin = PrimaryExecDef->getOperand(1).getReg(); - } + // if (PrimaryExecDef->getParent() == Pred->Block && + // PrimaryExecDef->getOpcode() == LMC.XorOpc && + // PrimaryExecDef->getOperand(1).isReg() && + // PrimaryExecDef->getOperand(2).isReg()) { + // if (PrimaryExecDef->getOperand(1).getReg() == LMC.ExecReg) + // Rejoin = PrimaryExecDef->getOperand(2).getReg(); + // else if (PrimaryExecDef->getOperand(2).getReg() == LMC.ExecReg) + // Rejoin = PrimaryExecDef->getOperand(1).getReg(); + // } if (!Rejoin) { // Try to find a previously generated XOR (or merely masked) value @@ -2199,6 +2199,7 @@ void ControlFlowRewriter::rewrite() { } + Updater.insertAccumulatorResets(); Updater.cleanup(); LLVM_DEBUG(dbgs() << "CFG_BEGIN:" << Function.getName().str() << "_clean\n"); diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp index 8bc1e7a552d4c..440d5a2018b95 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp @@ -31,47 +31,69 @@ bool GCNLaneMaskUtils::maybeLaneMask(Register Reg) const { /// Determine whether the lane-mask register \p Reg is a wave-wide constant. /// If so, the value is stored in \p Val. -bool GCNLaneMaskUtils::isConstantLaneMask(Register Reg, bool &Val) const { +bool GCNLaneMaskUtils::isConstantLaneMask(Register Reg, bool &Val, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { MachineRegisterInfo &MRI = MF.getRegInfo(); + dbgs() << "isConstantLaneMask(" << printReg(Reg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << MBB.name() << ") : \n"; + dbgs() << "MI:"; + if(MI != MBB.end()) MI->dump(); + else dbgs() << "end of block"; + dbgs() << "\n"; - const MachineInstr *MI; for (;;) { - MI = MRI.getVRegDef(Reg); - if (!MI) { + MI = MRI.getDomVRegDefInBasicBlock(Reg, MBB, MI); + if (MI == MBB.end()) { // This can happen when called from GCNLaneMaskUpdater, where Reg can // be a placeholder that has not yet been filled in. 
+ dbgs() << "MI == MBB.end(), return false\n"; return false; } - if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF) + dbgs() << "MI:"; + MI->dump(); + dbgs() << "\n"; + + if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF){ + dbgs() << "MI->getOpcode() == AMDGPU::IMPLICIT_DEF, return true;\n"; return true; + } if (MI->getOpcode() != AMDGPU::COPY) break; Reg = MI->getOperand(1).getReg(); - if (!Register::isVirtualRegister(Reg)) - return false; - if (!maybeLaneMask(Reg)) - return false; + if (!Register::isVirtualRegister(Reg)){ + dbgs() << "!Register::isVirtualRegister(Reg), return false\n"; + return false;} + if (!maybeLaneMask(Reg)){ + dbgs() << "!maybeLaneMask(Reg), return false\n"; + return false;} } - if (MI->getOpcode() != LMC.MovOpc) - return false; + dbgs() << "MI after loop:"; + MI->dump(); + dbgs() << "\n"; - if (!MI->getOperand(1).isImm()) - return false; + if (MI->getOpcode() != LMC.MovOpc){ + dbgs() << "MI->getOpcode() != LMC.MovOpc, return false\n"; + return false;} + + if (!MI->getOperand(1).isImm()){ + dbgs() << "!MI->getOperand(1).isImm(), return false\n"; + return false;} int64_t Imm = MI->getOperand(1).getImm(); if (Imm == 0) { + dbgs() << "Imm == 0, Val = false, return true\n"; Val = false; return true; } if (Imm == -1) { + dbgs() << "Imm == -1, Val = true, return true\n"; Val = true; return true; } + dbgs() << "End of isConstantLaneMask, return false\n"; return false; } @@ -105,13 +127,14 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg, GCNLaneMaskAnalysis *LMA, - bool accumulating) const { + bool accumulating, + bool isPrevZeroReg) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); bool PrevVal = false; - bool PrevConstant = !PrevReg || isConstantLaneMask(PrevReg, PrevVal); + bool PrevConstant = !PrevReg || isPrevZeroReg; bool CurVal = false; - bool CurConstant = isConstantLaneMask(CurReg, CurVal); + bool CurConstant = isConstantLaneMask(CurReg, CurVal, MBB, I); MachineRegisterInfo &MRI = MF.getRegInfo(); Printable destRegPrintable = printReg(DstReg , MRI.getTargetRegisterInfo(), 0, &MRI); @@ -125,7 +148,11 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, dbgs() << "\t\t Create instr : " << destRegPrintable << " = (" << prevRegPrintable << " & ~EXEC) | (" << curRegPrintable << " & EXEC) : \n"; dbgs() << "\t\tPrevConstant:" << PrevConstant << " CurConstant:" << CurConstant << "\n"; dbgs() << "\t\tPrevVal:" << PrevVal << " CurVal:" << CurVal << "\n"; - + dbgs() << "\t\tIterator I:"; + if(I != MBB.end()) I->dump(); + else dbgs() << "end of block"; + dbgs() << "\n"; + assert(PrevReg || !accumulating); if (PrevConstant && CurConstant) {// is wave wide constant? @@ -164,7 +191,7 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, } } if (!CurConstant) { - bool isCurRegSubsetOfExec = LMA && LMA->isSubsetOfExec(CurReg, MBB); + bool isCurRegSubsetOfExec = LMA && LMA->isSubsetOfExec(CurReg, MBB, I); dbgs() << "isSubsetOfExec(" << printReg(CurReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << MBB.name() << ") : " << isCurRegSubsetOfExec << "\n"; if ((PrevConstant && PrevVal) || isCurRegSubsetOfExec) { CurMaskedReg = CurReg; @@ -218,22 +245,26 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, /// (Reg & EXEC) == Reg when used in \p UseBlock. 
bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, + MachineBasicBlock::iterator I, unsigned RemainingDepth) { MachineRegisterInfo &MRI = LMU.function()->getRegInfo(); - MachineInstr *DefInstr = nullptr; + MachineBasicBlock::iterator DefInstr = UseBlock.end(); const AMDGPU::LaneMaskConstants &LMC = LMU.getLaneMaskConsts(); for (;;) { if (!Register::isVirtualRegister(Reg)) { if (Reg == LMC.ExecReg && - (!DefInstr || DefInstr->getParent() == &UseBlock)) + (DefInstr == UseBlock.end() || DefInstr->getParent() == &UseBlock)) return true; return false; } - DefInstr = MRI.getVRegDef(Reg); + DefInstr = MRI.getDomVRegDefInBasicBlock(Reg, UseBlock, I); + if(DefInstr == UseBlock.end()) + return false; if (DefInstr->getOpcode() == AMDGPU::COPY) { Reg = DefInstr->getOperand(1).getReg(); + I = DefInstr; continue; } @@ -272,7 +303,7 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, if ((LikeOr || IsAnd || IsAndN2) && (DefInstr->getOperand(1).isReg() && DefInstr->getOperand(2).isReg())) { bool FirstIsSubset = isSubsetOfExec(DefInstr->getOperand(1).getReg(), - UseBlock, RemainingDepth); + UseBlock, DefInstr, RemainingDepth); if (!FirstIsSubset && (LikeOr || IsAndN2)) return SubsetOfExec.try_emplace(Reg, false).first->second; @@ -282,7 +313,7 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, } bool SecondIsSubset = isSubsetOfExec(DefInstr->getOperand(2).getReg(), - UseBlock, RemainingDepth); + UseBlock, DefInstr, RemainingDepth); if (!SecondIsSubset) return SubsetOfExec.try_emplace(Reg, false).first->second; @@ -294,18 +325,18 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, } /// Initialize the updater. -void GCNLaneMaskUpdater::init(Register Reg) { +void GCNLaneMaskUpdater::init() { Processed = false; Blocks.clear(); // SSAUpdater.Initialize(LMU.getLaneMaskConsts().LaneMaskRC); - SSAUpdater.Initialize(Reg); + Accumulator = {}; } /// Optional cleanup, may remove stray instructions. 
void GCNLaneMaskUpdater::cleanup() { Processed = false; Blocks.clear(); - + Accumulator = {}; MachineRegisterInfo &MRI = LMU.function()->getRegInfo(); if (ZeroReg && MRI.use_empty(ZeroReg)) { @@ -362,7 +393,7 @@ Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) { dbgs() << "GCNLaneMaskUpdater::getValueInMiddleOfBlock(" << Block.name() << ")\n"; if (!Processed) process(); - Register reg = SSAUpdater.GetValueInMiddleOfBlock(&Block); + Register reg = Accumulator; dbgs() << "GCNLaneMaskUpdater::getValueInMiddleOfBlock(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; return reg; } @@ -377,7 +408,7 @@ Register GCNLaneMaskUpdater::getValueAtEndOfBlock(MachineBasicBlock &Block) { dbgs() << "GCNLaneMaskUpdater::getValueAtEndOfBlock(" << Block.name() << ")\n"; if (!Processed) process(); - Register reg = SSAUpdater.GetValueAtEndOfBlock(&Block); + Register reg = Accumulator; dbgs() << "GCNLaneMaskUpdater::getValueAtEndOfBlock(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; return reg; } @@ -390,8 +421,8 @@ Register GCNLaneMaskUpdater::getValueAfterMerge(MachineBasicBlock &Block) { Register reg = {}; auto BlockIt = findBlockInfo(Block); if (BlockIt != Blocks.end()) { - if (BlockIt->Merged){ - reg = BlockIt->Merged; + if (BlockIt->Value){ + reg = Accumulator; dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ") returning Merged.\n"; return reg; } @@ -404,7 +435,7 @@ Register GCNLaneMaskUpdater::getValueAfterMerge(MachineBasicBlock &Block) { // We didn't merge anything in the block, but the block may still be // ResetAtEnd, in which case we need the pre-reset value. - reg = SSAUpdater.GetValueInMiddleOfBlock(&Block); + reg = Accumulator; dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; return reg; } @@ -453,6 +484,29 @@ getSaluInsertionAtEnd(MachineBasicBlock &MBB) { llvm_unreachable("SCC used by terminator but no def in block"); } +void GCNLaneMaskUpdater::insertAccumulatorResets() { + dbgs() << "GCNLaneMaskUpdater::insertAccumulatorResets()\n"; + const SIInstrInfo *TII = LMU.function()->getSubtarget().getInstrInfo(); + for (auto &Entry : AccumulatorResetBlocks) { + MachineBasicBlock *B = Entry.first; + DenseSet &Accumulators = Entry.second; + for (Register ACC : Accumulators) { + //get first branch instruction + MachineBasicBlock::iterator I = B->getFirstTerminator(); + while(I != B->end() && !I->isBranch()) I++; + if(I == B->end()) I--; + dbgs() << " Resetting accumulator: " << printReg(ACC, MRI.getTargetRegisterInfo(), 0, &MRI) << "@" << B->name()<< "\n"; + dbgs() << " insertion point:"; + if(I == B->end()) + dbgs() << " end of block"; + else + I->dump(); + dbgs() << "\n"; + BuildMI(*B, I, {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ACC).addImm(0)->dump(); + } + } +} + /// Internal method to insert merge instructions. 
void GCNLaneMaskUpdater::process() { dbgs() << "\n\tGCNLaneMaskUpdater::process() begins\n"; @@ -471,38 +525,28 @@ void GCNLaneMaskUpdater::process() { dbgs() << "\tZeroReg:" << printReg(ZeroReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"; dbgs() << "\n\tAdding available values:\n"; + if (!Accumulator) { + Accumulator = LMU.createLaneMaskReg(); + dbgs() << "\tCreating Accumulator:" << printReg(Accumulator, MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"; + BuildMI(Entry, Entry.getFirstTerminator(), {}, + TII->get(LMU.getLaneMaskConsts().MovOpc), Accumulator) + .addImm(0); + } + dbgs() << "\n\tMachineSSAUpdater ready, begin merging\n"; + // Add available values. for (BlockInfo &Info : Blocks) { dbgs() << "\tAdd avail value for BlockInfo:" << Info.Block->name() << "\n\t"; assert(Accumulating || !Info.Flags); assert(Info.Flags || Info.Value); - - if (Info.Value){ - Info.Merged = LMU.createLaneMaskReg(); - dbgs() << "creating Info.Merged:" << printReg(Info.Merged, MRI.getTargetRegisterInfo(), 0, &MRI) << " for block " << Info.Block->name() << "\n\t"; - } - Info.dump(MRI); - //Info.Value and not ResetAtEnd, then Info.Merged, else ZeroReg - Register val = (Info.Value && !(Info.Flags & ResetAtEnd)) ? Info.Merged : ZeroReg; - dbgs() << "\t\t(Info.Value && !(Info.Flags & ResetAtEnd)) : " << (Info.Value && !(Info.Flags & ResetAtEnd)) << " => "; - if((Info.Value && !(Info.Flags & ResetAtEnd))) - dbgs() << "Info.Merged\n"; - else - dbgs() << "ZeroReg\n"; - SSAUpdater.AddAvailableValue(Info.Block,val); - dbgs() << "\n"; - - } - - if (Accumulating && !SSAUpdater.HasValueForBlock(&Entry)){ - dbgs() << "\tAdd avail value for Entry block : ZeroReg\n"; - SSAUpdater.AddAvailableValue(&Entry, ZeroReg); + if(!Info.Value || (Info.Flags & ResetAtEnd)){ + dbgs() << " !Info.Value || (Info.Flags & ResetAtEnd) is true\n"; + dbgs() << " AccumulatorResetBlocks[" << Info.Block->name() << "]:" << printReg(Accumulator, MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"; + AccumulatorResetBlocks[Info.Block].insert(Accumulator); + } } - - dbgs() << "\n\tMachineSSAUpdater ready, begin merging\n"; - - + // Once the SSA updater is ready, we can fill in all merge code, relying // on the SSA updater to insert required PHIs. for (BlockInfo &Info : Blocks) { @@ -516,11 +560,8 @@ void GCNLaneMaskUpdater::process() { Register Previous; if (Info.Block != &LMU.function()->front() && !(Info.Flags & ResetInMiddle)) { - Previous = SSAUpdater.GetValueInMiddleOfBlock(Info.Block); - if (Accumulating) { - assert(!MRI.getVRegDef(Previous) || - MRI.getVRegDef(Previous)->getOpcode() != AMDGPU::IMPLICIT_DEF); - } else { + Previous = Accumulator; + if (!Accumulating) { MachineInstr *PrevInstr = MRI.getVRegDef(Previous); if (PrevInstr && PrevInstr->getOpcode() == AMDGPU::IMPLICIT_DEF) { PotentiallyDead.insert(PrevInstr); @@ -539,10 +580,19 @@ void GCNLaneMaskUpdater::process() { // Insert merge logic. MachineBasicBlock::iterator insertPt = getSaluInsertionAtEnd(*Info.Block); - LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Info.Merged, Previous, - Info.Value, LMA, Accumulating); - - if (Info.Flags & ResetAtEnd) { + LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Accumulator, Previous, + Info.Value, LMA, Accumulating, (Previous == ZeroReg)); + + + /*if (Info.Flags & ResetAtEnd) { + // We enter this if block if Info.Block is Ti and Ri + // Here we check if Accumulator was set by a simple copy, if so, we use the corresponding register + // This is a copy propogation optimization. 
+ // It depends on getting the latest def of Accumulator in Info.Block and checking if it has no uses. + // TODO : Swithing off this optimization for nonSSA context since Accumulator will + // have a use at the end of Info.Block : Set Accumumlator to 0 (since Info.Block is Ri) + // Will implement a nonSSA variant for the same. + MachineInstr *mergeInstr = MRI.getVRegDef(Info.Merged); dbgs() << "\tmergeInstr:"; mergeInstr->dump(); @@ -555,7 +605,7 @@ void GCNLaneMaskUpdater::process() { dbgs() << "\tErase mergeInstr\n"; mergeInstr->eraseFromParent(); } - } + }*/ } Processed = true; diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h index 2903f93fd98e1..7084d5f362344 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h @@ -44,14 +44,15 @@ class GCNLaneMaskUtils { const AMDGPU::LaneMaskConstants &getLaneMaskConsts() const { return LMC; } bool maybeLaneMask(Register Reg) const; - bool isConstantLaneMask(Register Reg, bool &Val) const; + bool isConstantLaneMask(Register Reg, bool &Val, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; Register createLaneMaskReg() const; void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg, GCNLaneMaskAnalysis *LMA = nullptr, - bool Accumulating = false) const; + bool Accumulating = false, + bool isPrevZeroReg = false) const; }; /// Lazy analyses of lane masks. @@ -64,7 +65,8 @@ class GCNLaneMaskAnalysis { public: GCNLaneMaskAnalysis(MachineFunction &MF) : LMU(MF) {} - bool isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, + bool isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, + MachineBasicBlock::iterator I, unsigned RemainingDepth = 5); }; @@ -106,7 +108,6 @@ class GCNLaneMaskUpdater { private: GCNLaneMaskUtils LMU; GCNLaneMaskAnalysis *LMA = nullptr; - MachineSSAUpdater SSAUpdater; MachineRegisterInfo &MRI; bool Accumulating = false; @@ -116,7 +117,6 @@ class GCNLaneMaskUpdater { MachineBasicBlock *Block; unsigned Flags = 0; // ResetFlags Register Value; - Register Merged; explicit BlockInfo(MachineBasicBlock *Block) : Block(Block) {} @@ -124,7 +124,6 @@ class GCNLaneMaskUpdater { dbgs() << "BlockInfo{"; dbgs() << " Block:" << Block->name() << ","; dbgs() << " Value:" << printReg(Value, MRI.getTargetRegisterInfo(), 0, &MRI) << ","; - dbgs() << " Merged:" << printReg(Merged, MRI.getTargetRegisterInfo(), 0, &MRI) << ","; dbgs() << " Flags:"; if(Flags & ResetAtEnd) dbgs() << "ResetAtEnd,"; if(Flags & ResetInMiddle) dbgs() << "ResetInMiddle,"; @@ -136,13 +135,15 @@ class GCNLaneMaskUpdater { Register ZeroReg; DenseSet PotentiallyDead; - + DenseMap> AccumulatorResetBlocks; public: - GCNLaneMaskUpdater(MachineFunction &MF) : LMU(MF), SSAUpdater(MF), MRI(MF.getRegInfo()) {} + Register Accumulator; + + GCNLaneMaskUpdater(MachineFunction &MF) : LMU(MF), MRI(MF.getRegInfo()) {} void setLaneMaskAnalysis(GCNLaneMaskAnalysis *Analysis) { LMA = Analysis; } - void init(Register Reg); + void init(); void cleanup(); void setAccumulating(bool Val) { Accumulating = Val; } @@ -153,7 +154,7 @@ class GCNLaneMaskUpdater { Register getValueInMiddleOfBlock(MachineBasicBlock &Block); Register getValueAtEndOfBlock(MachineBasicBlock &Block); Register getValueAfterMerge(MachineBasicBlock &Block); - + void insertAccumulatorResets(); private: void process(); SmallVectorImpl::iterator findBlockInfo(MachineBasicBlock &Block); From 
1334f38f30647b3e0dcac936a8822c31a46aa164 Mon Sep 17 00:00:00 2001 From: lalaniket8 Date: Wed, 10 Dec 2025 14:27:45 +0530 Subject: [PATCH 2/2] cleanup --- .../lib/Target/AMDGPU/AMDGPUWaveTransform.cpp | 1 - llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp | 106 +++++++++--------- llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h | 4 - 3 files changed, 53 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp index 8e2896d7c5103..a219de6ebedd7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp @@ -1951,7 +1951,6 @@ void ControlFlowRewriter::rewrite() { RegMap; GCNLaneMaskUpdater Updater(Function); Updater.setLaneMaskAnalysis(&LMA); - Updater.setAccumulating(true); for (WaveNode *LaneTarget : NodeOrder) { LLVM_DEBUG(dbgs() << "\nPROCESSING NODE:" << LaneTarget->printableName() << "\n\n"); diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp index 440d5a2018b95..8e91adff83a0c 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp @@ -108,8 +108,7 @@ Register GCNLaneMaskUtils::createLaneMaskReg() const { /// DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC) /// /// before \p I in basic block \p MBB. Some simplifications are applied on the -/// fly based on constant inputs and analysis via \p LMA, and further -/// simplifications can be requested in "accumulating" mode. +/// fly based on constant inputs and analysis via \p LMA /// /// \param DstReg The virtual register into which the merged mask is written. /// \param PrevReg The virtual register with the "previous" lane mask value; @@ -118,16 +117,11 @@ Register GCNLaneMaskUtils::createLaneMaskReg() const { /// be merged into "previous". /// \param LMA If non-null, used to test whether CurReg may already be a subset /// of EXEC. -/// \param accumulating Indicates that we should assume PrevReg is already -/// properly masked, i.e. use PrevReg directly instead of -/// (PrevReg & ~EXEC), and don't add extra 1-bits to DstReg -/// beyond (CurReg & EXEC). void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg, GCNLaneMaskAnalysis *LMA, - bool accumulating, bool isPrevZeroReg) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -153,7 +147,7 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, else dbgs() << "end of block"; dbgs() << "\n"; - assert(PrevReg || !accumulating); + assert(PrevReg); if (PrevConstant && CurConstant) {// is wave wide constant? 
if (PrevVal == CurVal) { @@ -178,17 +172,7 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, Register PrevMaskedReg; Register CurMaskedReg; if (!PrevConstant) { - if (accumulating || (CurConstant && CurVal)) { - PrevMaskedReg = PrevReg; - } else { - PrevMaskedReg = createLaneMaskReg(); - dbgs() << "\t "; - PrevMaskedBuilt = - BuildMI(MBB, I, DL, TII->get(LMC.AndN2Opc), PrevMaskedReg) - .addReg(PrevReg) - .addReg(LMC.ExecReg); - PrevMaskedBuilt->dump(); - } + PrevMaskedReg = PrevReg; } if (!CurConstant) { bool isCurRegSubsetOfExec = LMA && LMA->isSubsetOfExec(CurReg, MBB, I); @@ -254,14 +238,17 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, for (;;) { if (!Register::isVirtualRegister(Reg)) { if (Reg == LMC.ExecReg && - (DefInstr == UseBlock.end() || DefInstr->getParent() == &UseBlock)) - return true; + (DefInstr == UseBlock.end() || DefInstr->getParent() == &UseBlock)){ + dbgs() << "Reg is EXEC in same BB, return true\n"; + return true;} + dbgs() << "Reg is not EXEC or is in other BB, return false\n"; return false; } DefInstr = MRI.getDomVRegDefInBasicBlock(Reg, UseBlock, I); - if(DefInstr == UseBlock.end()) - return false; + if(DefInstr == UseBlock.end()){ + dbgs() << "DefInstr == UseBlock.end(), return false\n"; + return false;} if (DefInstr->getOpcode() == AMDGPU::COPY) { Reg = DefInstr->getOperand(1).getReg(); I = DefInstr; @@ -270,57 +257,80 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, if (DefInstr->getOpcode() == LMC.MovOpc) { if (DefInstr->getOperand(1).isImm() && - DefInstr->getOperand(1).getImm() == 0) - return true; + DefInstr->getOperand(1).getImm() == 0){ + dbgs() << "MOV 0, return true\n"; + return true;} + dbgs() << "MOV is not imm or not 0, return false\n"; return false; } break; } - if (DefInstr->getParent() != &UseBlock) - return false; + dbgs() << "DefInstr:"; + DefInstr->dump(); + dbgs() << "\n"; + + if (DefInstr->getParent() != &UseBlock){ + dbgs() << "DefInstr->getParent() != &UseBlock, return false\n"; + return false;} auto CacheIt = SubsetOfExec.find(Reg); - if (CacheIt != SubsetOfExec.end()) + if (CacheIt != SubsetOfExec.end()){ + dbgs() << "CacheIt != SubsetOfExec.end(), return CacheIt->second: " << CacheIt->second << " \n"; return CacheIt->second; + } // V_CMP_xx always return a subset of EXEC. 
if (DefInstr->isCompare() && (SIInstrInfo::isVOPC(*DefInstr) || SIInstrInfo::isVOP3(*DefInstr))) { SubsetOfExec[Reg] = true; + dbgs() << "DefInstr is VOPC or VOP3, return true\n"; return true; } if (!RemainingDepth--) - return false; + {dbgs() << "RemainingDepth-- is 0, return false\n"; + return false;} bool LikeOr = DefInstr->getOpcode() == LMC.OrOpc || DefInstr->getOpcode() == LMC.XorOpc || DefInstr->getOpcode() == LMC.CSelectOpc; bool IsAnd = DefInstr->getOpcode() == LMC.AndOpc; bool IsAndN2 = DefInstr->getOpcode() == LMC.AndN2Opc; + dbgs() << "LikeOr: " << LikeOr << " IsAnd: " << IsAnd << " IsAndN2: " << IsAndN2 << "\n"; if ((LikeOr || IsAnd || IsAndN2) && (DefInstr->getOperand(1).isReg() && DefInstr->getOperand(2).isReg())) { bool FirstIsSubset = isSubsetOfExec(DefInstr->getOperand(1).getReg(), UseBlock, DefInstr, RemainingDepth); - if (!FirstIsSubset && (LikeOr || IsAndN2)) - return SubsetOfExec.try_emplace(Reg, false).first->second; + + dbgs() << "FirstIsSubset: " << FirstIsSubset << "\n"; + + if (!FirstIsSubset && (LikeOr || IsAndN2)){ + bool res = SubsetOfExec.try_emplace(Reg, false).first->second; + dbgs() << "FirstIsSubset is false and (LikeOr || IsAndN2), return res: " << res << "\n"; + return res;} if (FirstIsSubset && (IsAnd || IsAndN2)) { SubsetOfExec[Reg] = true; + dbgs() << "FirstIsSubset is true and (IsAnd || IsAndN2), return true\n"; return true; } bool SecondIsSubset = isSubsetOfExec(DefInstr->getOperand(2).getReg(), UseBlock, DefInstr, RemainingDepth); - if (!SecondIsSubset) - return SubsetOfExec.try_emplace(Reg, false).first->second; + dbgs() << "SecondIsSubset: " << SecondIsSubset << "\n"; + if (!SecondIsSubset){ + bool res = SubsetOfExec.try_emplace(Reg, false).first->second; + dbgs() << "SecondIsSubset is false, return res: " << res << "\n"; + return res;} SubsetOfExec[Reg] = true; + dbgs() << "SecondIsSubset is true, return true\n"; return true; } + dbgs() << "Enod of function ,return false\n"; return false; } @@ -329,19 +339,19 @@ void GCNLaneMaskUpdater::init() { Processed = false; Blocks.clear(); // SSAUpdater.Initialize(LMU.getLaneMaskConsts().LaneMaskRC); - Accumulator = {}; + Accumulator = AMDGPU::NoRegister; } /// Optional cleanup, may remove stray instructions. void GCNLaneMaskUpdater::cleanup() { Processed = false; Blocks.clear(); - Accumulator = {}; + Accumulator = AMDGPU::NoRegister; MachineRegisterInfo &MRI = LMU.function()->getRegInfo(); if (ZeroReg && MRI.use_empty(ZeroReg)) { MRI.getVRegDef(ZeroReg)->eraseFromParent(); - ZeroReg = {}; + ZeroReg = AMDGPU::NoRegister; } for (MachineInstr *MI : PotentiallyDead) { @@ -401,7 +411,7 @@ Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) { /// Return the value at the end of the given block, i.e. after any change that /// was registered via \ref addAvailable. /// -/// Note: If \p Block is the reset block in accumulating mode with ResetAtEnd +/// Note: If \p Block is the reset block with ResetAtEnd /// reset mode, then this value will be 0. You likely want /// \ref getPreReset instead. 
Register GCNLaneMaskUpdater::getValueAtEndOfBlock(MachineBasicBlock &Block) { @@ -418,7 +428,7 @@ Register GCNLaneMaskUpdater::getValueAfterMerge(MachineBasicBlock &Block) { dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << ")\n"; if (!Processed) process(); - Register reg = {}; + Register reg = AMDGPU::NoRegister; auto BlockIt = findBlockInfo(Block); if (BlockIt != Blocks.end()) { if (BlockIt->Value){ @@ -515,8 +525,7 @@ void GCNLaneMaskUpdater::process() { LMU.function()->getSubtarget().getInstrInfo(); MachineBasicBlock &Entry = LMU.function()->front(); - // Prepare an all-zero value for the default and reset in accumulating mode. - if (Accumulating && !ZeroReg) { + if (!ZeroReg) { ZeroReg = LMU.createLaneMaskReg(); BuildMI(Entry, Entry.getFirstTerminator(), {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ZeroReg) @@ -537,7 +546,6 @@ void GCNLaneMaskUpdater::process() { // Add available values. for (BlockInfo &Info : Blocks) { dbgs() << "\tAdd avail value for BlockInfo:" << Info.Block->name() << "\n\t"; - assert(Accumulating || !Info.Flags); assert(Info.Flags || Info.Value); Info.dump(MRI); if(!Info.Value || (Info.Flags & ResetAtEnd)){ @@ -561,27 +569,19 @@ void GCNLaneMaskUpdater::process() { if (Info.Block != &LMU.function()->front() && !(Info.Flags & ResetInMiddle)) { Previous = Accumulator; - if (!Accumulating) { - MachineInstr *PrevInstr = MRI.getVRegDef(Previous); - if (PrevInstr && PrevInstr->getOpcode() == AMDGPU::IMPLICIT_DEF) { - PotentiallyDead.insert(PrevInstr); - Previous = {}; - } - } } else { dbgs() << "\tEither one of the following 2 conds are true:\n"; dbgs() << "\tInfo.Block == &LMU.function()->front():" << (Info.Block == &LMU.function()->front()) << "\n"; dbgs() << "\tInfo.Flags & ResetInMiddle:" << (Info.Flags & ResetInMiddle) << "\n"; - if (Accumulating){ - Previous = ZeroReg; - dbgs() << "\tBlock:" << Info.Block->name() << " Previous is ZeroReg:" << printReg(Previous , MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"; - } + Previous = ZeroReg; + dbgs() << "\tBlock:" << Info.Block->name() << " Previous is ZeroReg:" << printReg(Previous , MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"; + } // Insert merge logic. MachineBasicBlock::iterator insertPt = getSaluInsertionAtEnd(*Info.Block); LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Accumulator, Previous, - Info.Value, LMA, Accumulating, (Previous == ZeroReg)); + Info.Value, LMA, (Previous == ZeroReg)); /*if (Info.Flags & ResetAtEnd) { diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h index 7084d5f362344..e20bc1fa39e3c 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h @@ -51,7 +51,6 @@ class GCNLaneMaskUtils { MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg, GCNLaneMaskAnalysis *LMA = nullptr, - bool Accumulating = false, bool isPrevZeroReg = false) const; }; @@ -109,7 +108,6 @@ class GCNLaneMaskUpdater { GCNLaneMaskUtils LMU; GCNLaneMaskAnalysis *LMA = nullptr; MachineRegisterInfo &MRI; - bool Accumulating = false; bool Processed = false; @@ -146,8 +144,6 @@ class GCNLaneMaskUpdater { void init(); void cleanup(); - void setAccumulating(bool Val) { Accumulating = Val; } - void addReset(MachineBasicBlock &Block, ResetFlags Flags); void addAvailable(MachineBasicBlock &Block, Register Value);
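
With both patches applied, GCNLaneMaskUpdater no longer routes lane-mask merges through MachineSSAUpdater: a single non-SSA Accumulator register receives each block's (Cond & EXEC) contribution via buildMergeLaneMasks, and insertAccumulatorResets() re-zeroes it in every reset block. A minimal host-side sketch of that arithmetic on plain 64-bit wave masks follows; it is illustrative only, with hypothetical helper names and mask values that are not part of the patch.

// Illustrative sketch (not part of the patch): models the lane-mask updates
// GCNLaneMaskUpdater performs on a single non-SSA accumulator register.
// Wave masks are modelled as plain 64-bit integers.
#include <cassert>
#include <cstdint>

// General merge form built by buildMergeLaneMasks:
//   Dst = (Prev & ~EXEC) | (Cur & EXEC)
static uint64_t mergeLaneMasks(uint64_t Prev, uint64_t Cur, uint64_t Exec) {
  return (Prev & ~Exec) | (Cur & Exec);
}

// Accumulating form used by the updater: Prev is the accumulator itself and
// carries no lanes of the current EXEC, so the AND-NOT folds away.
static uint64_t accumulate(uint64_t Acc, uint64_t Cond, uint64_t Exec) {
  return Acc | (Cond & Exec);
}

int main() {
  // The entry block materializes Acc = 0 (MovOpc immediate 0 in process()).
  uint64_t Acc = 0;

  // Origin block A: lanes 0-31 active, condition true for lanes 0-15.
  Acc = accumulate(Acc, 0x000000000000FFFFull, 0x00000000FFFFFFFFull);
  // Origin block B: lanes 32-63 active, condition true for lanes 32-47.
  Acc = accumulate(Acc, 0x0000FFFF00000000ull, 0xFFFFFFFF00000000ull);
  assert(Acc == 0x0000FFFF0000FFFFull);

  // A ResetAtEnd/ResetInMiddle block re-zeroes the accumulator, which is what
  // insertAccumulatorResets() does (MovOpc immediate 0) before the branch.
  Acc = 0;

  // Both forms agree whenever Prev holds no lanes of the current EXEC.
  assert(mergeLaneMasks(0x3ull, 0xF0ull, 0xF0ull) ==
         accumulate(0x3ull, 0xF0ull, 0xF0ull));
  return 0;
}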