diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 18b0f617ca232..4ab99edd64baa 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -21,6 +21,218 @@ using namespace llvm; +/// \returns \p Len urem \p OpSize, checking for optimization opportunities. +/// \p OpSizeVal must be the integer value of the \c ConstantInt \p OpSize. +static Value *getRuntimeLoopRemainder(IRBuilderBase &B, Value *Len, + Value *OpSize, unsigned OpSizeVal) { + // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem. + if (isPowerOf2_32(OpSizeVal)) + return B.CreateAnd(Len, OpSizeVal - 1); + return B.CreateURem(Len, OpSize); +} + +/// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization +/// opportunities. +/// If \p RTLoopRemainder is provided, it must be the result of +/// \c getRuntimeLoopRemainder() with the same arguments. +static Value *getRuntimeLoopUnits(IRBuilderBase &B, Value *Len, Value *OpSize, + unsigned OpSizeVal, + Value *RTLoopRemainder = nullptr) { + if (!RTLoopRemainder) + RTLoopRemainder = getRuntimeLoopRemainder(B, Len, OpSize, OpSizeVal); + return B.CreateSub(Len, RTLoopRemainder); +} + +namespace { +/// Container for the return values of insertLoopExpansion. +struct LoopExpansionInfo { + /// The instruction at the end of the main loop body. + Instruction *MainLoopIP = nullptr; + + /// The unit index in the main loop body. + Value *MainLoopIndex = nullptr; + + /// The instruction at the end of the residual loop body. Can be nullptr if no + /// residual is required. + Instruction *ResidualLoopIP = nullptr; + + /// The unit index in the residual loop body. Can be nullptr if no residual is + /// required. + Value *ResidualLoopIndex = nullptr; +}; +} // namespace + +/// Insert the control flow and loop counters for a memcpy/memset loop +/// expansion. +/// +/// This function inserts IR corresponding to the following C code before +/// \p InsertBefore: +/// \code +/// LoopUnits = (Len / MainLoopStep) * MainLoopStep; +/// ResidualUnits = Len - LoopUnits; +/// MainLoopIndex = 0; +/// if (LoopUnits > 0) { +/// do { +/// // MainLoopIP +/// MainLoopIndex += MainLoopStep; +/// } while (MainLoopIndex < LoopUnits); +/// } +/// for (size_t i = 0; i < ResidualUnits; i += ResidualLoopStep) { +/// ResidualLoopIndex = LoopUnits + i; +/// // ResidualLoopIP +/// } +/// \endcode +/// +/// \p MainLoopStep and \p ResidualLoopStep determine by how many "units" the +/// loop index is increased in each iteration of the main and residual loops, +/// respectively. In most cases, the "unit" will be bytes, but larger units are +/// useful for lowering memset.pattern. +/// +/// The computation of \c LoopUnits and \c ResidualUnits is performed at compile +/// time if \p Len is a \c ConstantInt. +/// The second (residual) loop is omitted if \p ResidualLoopStep is 0 or equal +/// to \p MainLoopStep. +/// The generated \c MainLoopIP, \c MainLoopIndex, \c ResidualLoopIP, and +/// \c ResidualLoopIndex are returned in a \c LoopExpansionInfo object. 
+static LoopExpansionInfo insertLoopExpansion(Instruction *InsertBefore, + Value *Len, unsigned MainLoopStep, + unsigned ResidualLoopStep, + StringRef BBNamePrefix) { + assert((ResidualLoopStep == 0 || MainLoopStep % ResidualLoopStep == 0) && + "ResidualLoopStep must divide MainLoopStep if specified"); + assert(ResidualLoopStep <= MainLoopStep && + "ResidualLoopStep cannot be larger than MainLoopStep"); + assert(MainLoopStep > 0 && "MainLoopStep must be non-zero"); + LoopExpansionInfo LEI; + BasicBlock *PreLoopBB = InsertBefore->getParent(); + BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock( + InsertBefore, BBNamePrefix + "-post-expansion"); + Function *ParentFunc = PreLoopBB->getParent(); + LLVMContext &Ctx = PreLoopBB->getContext(); + IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator()); + + // Calculate the main loop trip count and remaining units to cover after the + // loop. + Type *LenType = Len->getType(); + IntegerType *ILenType = cast(LenType); + ConstantInt *CIMainLoopStep = ConstantInt::get(ILenType, MainLoopStep); + + Value *LoopUnits = Len; + Value *ResidualUnits = nullptr; + // We can make a conditional branch unconditional if we know that the + // MainLoop must be executed at least once. + bool MustTakeMainLoop = false; + if (MainLoopStep != 1) { + if (auto *CLen = dyn_cast(Len)) { + uint64_t TotalUnits = CLen->getZExtValue(); + uint64_t LoopEndCount = alignDown(TotalUnits, MainLoopStep); + uint64_t ResidualCount = TotalUnits - LoopEndCount; + LoopUnits = ConstantInt::get(LenType, LoopEndCount); + ResidualUnits = ConstantInt::get(LenType, ResidualCount); + MustTakeMainLoop = LoopEndCount > 0; + // As an optimization, we could skip generating the residual loop if + // ResidualCount is known to be 0. However, current uses of this function + // don't request a residual loop if the length is constant (they generate + // a (potentially empty) sequence of loads and stores instead), so this + // optimization would have no effect here. + } else { + ResidualUnits = getRuntimeLoopRemainder(PreLoopBuilder, Len, + CIMainLoopStep, MainLoopStep); + LoopUnits = getRuntimeLoopUnits(PreLoopBuilder, Len, CIMainLoopStep, + MainLoopStep, ResidualUnits); + } + } else if (auto *CLen = dyn_cast(Len)) { + MustTakeMainLoop = CLen->getZExtValue() > 0; + } + + BasicBlock *MainLoopBB = BasicBlock::Create( + Ctx, BBNamePrefix + "-expansion-main-body", ParentFunc, PostLoopBB); + IRBuilder<> LoopBuilder(MainLoopBB); + + PHINode *LoopIndex = LoopBuilder.CreatePHI(LenType, 2, "loop-index"); + LEI.MainLoopIndex = LoopIndex; + LoopIndex->addIncoming(ConstantInt::get(LenType, 0U), PreLoopBB); + + Value *NewIndex = + LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(LenType, MainLoopStep)); + LoopIndex->addIncoming(NewIndex, MainLoopBB); + + // One argument of the addition is a loop-variant PHI, so it must be an + // Instruction (i.e., it cannot be a Constant). + LEI.MainLoopIP = cast(NewIndex); + + if (ResidualLoopStep > 0 && ResidualLoopStep < MainLoopStep) { + // Loop body for the residual accesses. + BasicBlock *ResLoopBB = + BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-body", + PreLoopBB->getParent(), PostLoopBB); + // BB to check if the residual loop is needed. + BasicBlock *ResidualCondBB = + BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-cond", + PreLoopBB->getParent(), ResLoopBB); + + // Enter the MainLoop unless no main loop iteration is required. 
+ ConstantInt *Zero = ConstantInt::get(ILenType, 0U); + if (MustTakeMainLoop) + PreLoopBuilder.CreateBr(MainLoopBB); + else + PreLoopBuilder.CreateCondBr(PreLoopBuilder.CreateICmpNE(LoopUnits, Zero), + MainLoopBB, ResidualCondBB); + PreLoopBB->getTerminator()->eraseFromParent(); + + // Stay in the MainLoop until we have handled all the LoopUnits. Then go to + // the residual condition BB. + LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopUnits), + MainLoopBB, ResidualCondBB); + + // Determine if we need to branch to the residual loop or bypass it. + IRBuilder<> RCBuilder(ResidualCondBB); + RCBuilder.CreateCondBr(RCBuilder.CreateICmpNE(ResidualUnits, Zero), + ResLoopBB, PostLoopBB); + + IRBuilder<> ResBuilder(ResLoopBB); + PHINode *ResidualIndex = + ResBuilder.CreatePHI(LenType, 2, "residual-loop-index"); + ResidualIndex->addIncoming(Zero, ResidualCondBB); + + // Add the offset at the end of the main loop to the loop counter of the + // residual loop to get the proper index. + Value *FullOffset = ResBuilder.CreateAdd(LoopUnits, ResidualIndex); + LEI.ResidualLoopIndex = FullOffset; + + Value *ResNewIndex = ResBuilder.CreateAdd( + ResidualIndex, ConstantInt::get(LenType, ResidualLoopStep)); + ResidualIndex->addIncoming(ResNewIndex, ResLoopBB); + + // One argument of the addition is a loop-variant PHI, so it must be an + // Instruction (i.e., it cannot be a Constant). + LEI.ResidualLoopIP = cast(ResNewIndex); + + // Stay in the residual loop until all ResidualUnits are handled. + ResBuilder.CreateCondBr( + ResBuilder.CreateICmpULT(ResNewIndex, ResidualUnits), ResLoopBB, + PostLoopBB); + } else { + // There is no need for a residual loop after the main loop. We do however + // need to patch up the control flow by creating the terminators for the + // preloop block and the main loop. + + // Enter the MainLoop unless no main loop iteration is required. + if (MustTakeMainLoop) { + PreLoopBuilder.CreateBr(MainLoopBB); + } else { + ConstantInt *Zero = ConstantInt::get(ILenType, 0U); + PreLoopBuilder.CreateCondBr(PreLoopBuilder.CreateICmpNE(LoopUnits, Zero), + MainLoopBB, PostLoopBB); + } + PreLoopBB->getTerminator()->eraseFromParent(); + // Stay in the MainLoop until we have handled all the LoopUnits. + LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopUnits), + MainLoopBB, PostLoopBB); + } + return LEI; +} + void llvm::createMemCpyLoopKnownSize( Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile, @@ -31,7 +243,6 @@ void llvm::createMemCpyLoopKnownSize( return; BasicBlock *PreLoopBB = InsertBefore->getParent(); - BasicBlock *PostLoopBB = nullptr; Function *ParentFunc = PreLoopBB->getParent(); LLVMContext &Ctx = PreLoopBB->getContext(); const DataLayout &DL = ParentFunc->getDataLayout(); @@ -56,37 +267,32 @@ void llvm::createMemCpyLoopKnownSize( uint64_t LoopEndCount = alignDown(CopyLen->getZExtValue(), LoopOpSize); + // Skip the loop expansion entirely if the loop would never be taken. 
if (LoopEndCount != 0) { - // Split - PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split"); - BasicBlock *LoopBB = - BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB); - PreLoopBB->getTerminator()->setSuccessor(0, LoopBB); - - IRBuilder<> PLBuilder(PreLoopBB->getTerminator()); + LoopExpansionInfo LEI = insertLoopExpansion(InsertBefore, CopyLen, + LoopOpSize, 0, "static-memcpy"); + // Fill MainLoopBB + IRBuilder<> MainLoopBuilder(LEI.MainLoopIP); Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize)); - IRBuilder<> LoopBuilder(LoopBB); - PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index"); - LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB); - // Loop Body - // If we used LoopOpType as GEP element type, we would iterate over the // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e., // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use // byte offsets computed from the TypeStoreSize. - Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex); - LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, - PartSrcAlign, SrcIsVolatile); + Value *SrcGEP = + MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex); + LoadInst *Load = MainLoopBuilder.CreateAlignedLoad( + LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile); if (!CanOverlap) { // Set alias scope for loads. Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope)); } - Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex); - StoreInst *Store = LoopBuilder.CreateAlignedStore( + Value *DstGEP = + MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex); + StoreInst *Store = MainLoopBuilder.CreateAlignedStore( Load, DstGEP, PartDstAlign, DstIsVolatile); if (!CanOverlap) { // Indicate that stores don't overlap loads. @@ -96,96 +302,63 @@ void llvm::createMemCpyLoopKnownSize( Load->setAtomic(AtomicOrdering::Unordered); Store->setAtomic(AtomicOrdering::Unordered); } - Value *NewIndex = LoopBuilder.CreateAdd( - LoopIndex, ConstantInt::get(TypeOfCopyLen, LoopOpSize)); - LoopIndex->addIncoming(NewIndex, LoopBB); - - // Create the loop branch condition. - Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount); - LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI), - LoopBB, PostLoopBB); + assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex && + "No residual loop was requested"); } + // Copy the remaining bytes with straight-line code. uint64_t BytesCopied = LoopEndCount; uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied; - if (RemainingBytes) { - BasicBlock::iterator InsertIt = PostLoopBB ? 
PostLoopBB->getFirstNonPHIIt() - : InsertBefore->getIterator(); - IRBuilder<> RBuilder(InsertIt->getParent(), InsertIt); + if (RemainingBytes == 0) + return; - SmallVector RemainingOps; - TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, - SrcAS, DstAS, SrcAlign, DstAlign, - AtomicElementSize); + IRBuilder<> RBuilder(InsertBefore); + SmallVector RemainingOps; + TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, + SrcAS, DstAS, SrcAlign, DstAlign, + AtomicElementSize); - for (auto *OpTy : RemainingOps) { - Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied)); - Align PartDstAlign(commonAlignment(DstAlign, BytesCopied)); - - unsigned OperandSize = DL.getTypeStoreSize(OpTy); - assert( - (!AtomicElementSize || OperandSize % *AtomicElementSize == 0) && - "Atomic memcpy lowering is not supported for selected operand size"); - - Value *SrcGEP = RBuilder.CreateInBoundsGEP( - Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied)); - LoadInst *Load = - RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile); - if (!CanOverlap) { - // Set alias scope for loads. - Load->setMetadata(LLVMContext::MD_alias_scope, - MDNode::get(Ctx, NewScope)); - } - Value *DstGEP = RBuilder.CreateInBoundsGEP( - Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied)); - StoreInst *Store = RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, - DstIsVolatile); - if (!CanOverlap) { - // Indicate that stores don't overlap loads. - Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); - } - if (AtomicElementSize) { - Load->setAtomic(AtomicOrdering::Unordered); - Store->setAtomic(AtomicOrdering::Unordered); - } - BytesCopied += OperandSize; + for (auto *OpTy : RemainingOps) { + Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied)); + Align PartDstAlign(commonAlignment(DstAlign, BytesCopied)); + + unsigned OperandSize = DL.getTypeStoreSize(OpTy); + assert((!AtomicElementSize || OperandSize % *AtomicElementSize == 0) && + "Atomic memcpy lowering is not supported for selected operand size"); + + Value *SrcGEP = RBuilder.CreateInBoundsGEP( + Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied)); + LoadInst *Load = + RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile); + if (!CanOverlap) { + // Set alias scope for loads. + Load->setMetadata(LLVMContext::MD_alias_scope, + MDNode::get(Ctx, NewScope)); + } + Value *DstGEP = RBuilder.CreateInBoundsGEP( + Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied)); + StoreInst *Store = + RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); + if (!CanOverlap) { + // Indicate that stores don't overlap loads. + Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); } + if (AtomicElementSize) { + Load->setAtomic(AtomicOrdering::Unordered); + Store->setAtomic(AtomicOrdering::Unordered); + } + BytesCopied += OperandSize; } assert(BytesCopied == CopyLen->getZExtValue() && "Bytes copied should match size in the call!"); } -// \returns \p Len urem \p OpSize, checking for optimization opportunities. -static Value *getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B, - Value *Len, Value *OpSize, - unsigned OpSizeVal) { - // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem. - if (isPowerOf2_32(OpSizeVal)) - return B.CreateAnd(Len, OpSizeVal - 1); - return B.CreateURem(Len, OpSize); -} - -// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization -// opportunities. 
-// If RTLoopRemainder is provided, it must be the result of -// getRuntimeLoopRemainder() with the same arguments. -static Value *getRuntimeLoopBytes(const DataLayout &DL, IRBuilderBase &B, - Value *Len, Value *OpSize, unsigned OpSizeVal, - Value *RTLoopRemainder = nullptr) { - if (!RTLoopRemainder) - RTLoopRemainder = getRuntimeLoopRemainder(DL, B, Len, OpSize, OpSizeVal); - return B.CreateSub(Len, RTLoopRemainder); -} - void llvm::createMemCpyLoopUnknownSize( Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI, std::optional AtomicElementSize) { BasicBlock *PreLoopBB = InsertBefore->getParent(); - BasicBlock *PostLoopBB = - PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion"); - Function *ParentFunc = PreLoopBB->getParent(); const DataLayout &DL = ParentFunc->getDataLayout(); LLVMContext &Ctx = PreLoopBB->getContext(); @@ -205,50 +378,39 @@ void llvm::createMemCpyLoopUnknownSize( assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) && "Atomic memcpy lowering is not supported for selected operand size"); - IRBuilder<> PLBuilder(PreLoopBB->getTerminator()); - - // Calculate the loop trip count, and remaining bytes to copy after the loop. - Type *CopyLenType = CopyLen->getType(); - IntegerType *ILengthType = dyn_cast(CopyLenType); - assert(ILengthType && - "expected size argument to memcpy to be an integer type!"); Type *Int8Type = Type::getInt8Ty(Ctx); - bool LoopOpIsInt8 = LoopOpType == Int8Type; - ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize); - Value *RuntimeLoopBytes = CopyLen; - Value *RuntimeResidualBytes = nullptr; - if (!LoopOpIsInt8) { - RuntimeResidualBytes = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen, - CILoopOpSize, LoopOpSize); - RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize, - LoopOpSize, RuntimeResidualBytes); - } + Type *ResidualLoopOpType = AtomicElementSize + ? Type::getIntNTy(Ctx, *AtomicElementSize * 8) + : Int8Type; + unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType); + assert(ResidualLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) && + "Store size is expected to match type size"); - BasicBlock *LoopBB = - BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB); - IRBuilder<> LoopBuilder(LoopBB); + LoopExpansionInfo LEI = insertLoopExpansion( + InsertBefore, CopyLen, LoopOpSize, ResidualLoopOpSize, "dynamic-memcpy"); + // Fill MainLoopBB + IRBuilder<> MainLoopBuilder(LEI.MainLoopIP); Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize)); Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); - PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index"); - LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB); - // If we used LoopOpType as GEP element type, we would iterate over the // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e., // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use byte // offsets computed from the TypeStoreSize. 
- Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex); - LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, - PartSrcAlign, SrcIsVolatile); + Value *SrcGEP = + MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex); + LoadInst *Load = MainLoopBuilder.CreateAlignedLoad( + LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile); if (!CanOverlap) { // Set alias scope for loads. Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope)); } - Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex); - StoreInst *Store = - LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); + Value *DstGEP = + MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex); + StoreInst *Store = MainLoopBuilder.CreateAlignedStore( + Load, DstGEP, PartDstAlign, DstIsVolatile); if (!CanOverlap) { // Indicate that stores don't overlap loads. Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); @@ -257,95 +419,35 @@ void llvm::createMemCpyLoopUnknownSize( Load->setAtomic(AtomicOrdering::Unordered); Store->setAtomic(AtomicOrdering::Unordered); } - Value *NewIndex = LoopBuilder.CreateAdd( - LoopIndex, ConstantInt::get(CopyLenType, LoopOpSize)); - LoopIndex->addIncoming(NewIndex, LoopBB); - - bool RequiresResidual = - !LoopOpIsInt8 && !(AtomicElementSize && LoopOpSize == AtomicElementSize); - if (RequiresResidual) { - Type *ResLoopOpType = AtomicElementSize - ? Type::getIntNTy(Ctx, *AtomicElementSize * 8) - : Int8Type; - unsigned ResLoopOpSize = DL.getTypeStoreSize(ResLoopOpType); - assert((ResLoopOpSize == AtomicElementSize ? *AtomicElementSize : 1) && - "Store size is expected to match type size"); - - Align ResSrcAlign(commonAlignment(PartSrcAlign, ResLoopOpSize)); - Align ResDstAlign(commonAlignment(PartDstAlign, ResLoopOpSize)); - - // Loop body for the residual copy. - BasicBlock *ResLoopBB = BasicBlock::Create( - Ctx, "loop-memcpy-residual", PreLoopBB->getParent(), PostLoopBB); - // Residual loop header. - BasicBlock *ResHeaderBB = BasicBlock::Create( - Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr); - - // Need to update the pre-loop basic block to branch to the correct place. - // branch to the main loop if the count is non-zero, branch to the residual - // loop if the copy size is smaller then 1 iteration of the main loop but - // non-zero and finally branch to after the residual loop if the memcpy - // size is zero. - ConstantInt *Zero = ConstantInt::get(ILengthType, 0U); - PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero), - LoopBB, ResHeaderBB); - PreLoopBB->getTerminator()->eraseFromParent(); - LoopBuilder.CreateCondBr( - LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB, - ResHeaderBB); - - // Determine if we need to branch to the residual loop or bypass it. - IRBuilder<> RHBuilder(ResHeaderBB); - RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidualBytes, Zero), - ResLoopBB, PostLoopBB); + // Fill ResidualLoopBB. + if (!LEI.ResidualLoopIP) + return; - // Copy the residual with single byte load/store loop. 
- IRBuilder<> ResBuilder(ResLoopBB); - PHINode *ResidualIndex = - ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index"); - ResidualIndex->addIncoming(Zero, ResHeaderBB); + Align ResSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize)); + Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize)); - Value *FullOffset = ResBuilder.CreateAdd(RuntimeLoopBytes, ResidualIndex); - Value *SrcGEP = ResBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, FullOffset); - LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP, - ResSrcAlign, SrcIsVolatile); - if (!CanOverlap) { - // Set alias scope for loads. - Load->setMetadata(LLVMContext::MD_alias_scope, - MDNode::get(Ctx, NewScope)); - } - Value *DstGEP = ResBuilder.CreateInBoundsGEP(Int8Type, DstAddr, FullOffset); - StoreInst *Store = - ResBuilder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile); - if (!CanOverlap) { - // Indicate that stores don't overlap loads. - Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); - } - if (AtomicElementSize) { - Load->setAtomic(AtomicOrdering::Unordered); - Store->setAtomic(AtomicOrdering::Unordered); - } - Value *ResNewIndex = ResBuilder.CreateAdd( - ResidualIndex, ConstantInt::get(CopyLenType, ResLoopOpSize)); - ResidualIndex->addIncoming(ResNewIndex, ResLoopBB); - - // Create the loop branch condition. - ResBuilder.CreateCondBr( - ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidualBytes), ResLoopBB, - PostLoopBB); - } else { - // In this case the loop operand type was a byte, and there is no need for a - // residual loop to copy the remaining memory after the main loop. - // We do however need to patch up the control flow by creating the - // terminators for the preloop block and the memcpy loop. - ConstantInt *Zero = ConstantInt::get(ILengthType, 0U); - PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero), - LoopBB, PostLoopBB); - PreLoopBB->getTerminator()->eraseFromParent(); - LoopBuilder.CreateCondBr( - LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB, - PostLoopBB); + IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP); + Value *ResSrcGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, + LEI.ResidualLoopIndex); + LoadInst *ResLoad = ResLoopBuilder.CreateAlignedLoad( + ResidualLoopOpType, ResSrcGEP, ResSrcAlign, SrcIsVolatile); + if (!CanOverlap) { + // Set alias scope for loads. + ResLoad->setMetadata(LLVMContext::MD_alias_scope, + MDNode::get(Ctx, NewScope)); + } + Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, + LEI.ResidualLoopIndex); + StoreInst *ResStore = ResLoopBuilder.CreateAlignedStore( + ResLoad, ResDstGEP, ResDstAlign, DstIsVolatile); + if (!CanOverlap) { + // Indicate that stores don't overlap loads. 
+ ResStore->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); + } + if (AtomicElementSize) { + ResLoad->setAtomic(AtomicOrdering::Unordered); + ResStore->setAtomic(AtomicOrdering::Unordered); } } @@ -439,9 +541,9 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore, Value *RuntimeLoopRemainder = nullptr; Value *SkipResidualCondition = nullptr; if (RequiresResidual) { - RuntimeLoopRemainder = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen, - CILoopOpSize, LoopOpSize); - RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize, + RuntimeLoopRemainder = + getRuntimeLoopRemainder(PLBuilder, CopyLen, CILoopOpSize, LoopOpSize); + RuntimeLoopBytes = getRuntimeLoopUnits(PLBuilder, CopyLen, CILoopOpSize, LoopOpSize, RuntimeLoopRemainder); SkipResidualCondition = PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual"); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll index e0016b0a5a64d..993fb7eeb3aa9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll @@ -12,7 +12,7 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: s_mov_b32 s3, 0xf000 ; LOOP-NEXT: v_mov_b32_e32 v5, s1 ; LOOP-NEXT: v_mov_b32_e32 v4, s0 -; LOOP-NEXT: .LBB0_1: ; %load-store-loop +; LOOP-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 ; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4 ; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc @@ -177,7 +177,7 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: buffer_store_byte v30, v[6:7], s[0:3], 0 addr64 offset:30 ; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:31 ; LOOP-NEXT: s_cbranch_vccnz .LBB0_1 -; LOOP-NEXT: ; %bb.2: ; %memcpy-split +; LOOP-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; LOOP-NEXT: s_mov_b32 s2, 0 ; LOOP-NEXT: s_mov_b32 s3, 0xf000 ; LOOP-NEXT: s_mov_b64 s[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 931a62298812f..1e79c4f63cd42 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -255,7 +255,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: s_mov_b32 s17, s10 ; SDAG-GFX942-NEXT: s_mov_b32 s2, s9 ; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] -; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop +; SDAG-GFX942-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 ; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 @@ -312,7 +312,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) ; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1 -; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; SDAG-GFX942-NEXT: s_endpgm ; ; SDAG-GFX1100-LABEL: memcpy_known: @@ -341,7 +341,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] -; 
SDAG-GFX1100-NEXT: .LBB0_1: ; %load-store-loop +; SDAG-GFX1100-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -400,7 +400,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) ; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240 ; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB0_1 -; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; SDAG-GFX1100-NEXT: s_endpgm ; ; GISEL-GFX942-LABEL: memcpy_known: @@ -419,7 +419,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: s_mov_b32 s5, s14 ; GISEL-GFX942-NEXT: s_mov_b32 s6, s15 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop +; GISEL-GFX942-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen @@ -477,7 +477,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1 -; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; GISEL-GFX942-NEXT: s_endpgm ; ; GISEL-GFX1100-LABEL: memcpy_known: @@ -497,7 +497,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9 ; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10 ; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11 -; GISEL-GFX1100-NEXT: .LBB0_1: ; %load-store-loop +; GISEL-GFX1100-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 @@ -553,7 +553,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240 ; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x2000, v0 ; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 -; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; GISEL-GFX1100-NEXT: s_endpgm call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) ret void @@ -787,7 +787,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: s_mov_b32 s17, s10 ; SDAG-GFX942-NEXT: s_mov_b32 s2, s9 ; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] -; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop +; SDAG-GFX942-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 ; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 @@ -844,7 +844,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) ; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB1_1 -; SDAG-GFX942-NEXT: ; %bb.2: ; 
%memcpy-split +; SDAG-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; SDAG-GFX942-NEXT: s_endpgm ; ; SDAG-GFX1100-LABEL: memcpy_known_medium: @@ -873,7 +873,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] -; SDAG-GFX1100-NEXT: .LBB1_1: ; %load-store-loop +; SDAG-GFX1100-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -932,7 +932,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) ; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240 ; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB1_1 -; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; SDAG-GFX1100-NEXT: s_endpgm ; ; GISEL-GFX942-LABEL: memcpy_known_medium: @@ -951,7 +951,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: s_mov_b32 s5, s14 ; GISEL-GFX942-NEXT: s_mov_b32 s6, s15 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop +; GISEL-GFX942-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 -; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; GISEL-GFX942-NEXT: s_endpgm ; ; GISEL-GFX1100-LABEL: memcpy_known_medium: @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9 ; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10 ; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11 -; GISEL-GFX1100-NEXT: .LBB1_1: ; %load-store-loop +; GISEL-GFX1100-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 @@ -1085,7 +1085,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240 ; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x100, v0 ; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1 -; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; GISEL-GFX1100-NEXT: s_endpgm call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 256, i1 false) ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll index ad0b4fd8d902e..83d6f4f5882b4 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll @@ -19,9 +19,9 @@ 
define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %d ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -135,8 +135,8 @@ define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %d ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -211,9 +211,9 @@ define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inre ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[LOOP_INDEX_C:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX_C]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) @@ -329,8 +329,8 @@ define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inre ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label 
%[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p7.p7.i64(ptr addrspace(7) %dst, ptr addrspace(7) %src, i64 8192, i1 false) @@ -366,18 +366,21 @@ define void @memcpy_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15 ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[LENGTH]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_MEMCPY_EXPANSION:.*]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER:.*]] -; CHECK: [[LOOP_MEMCPY_EXPANSION]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ] +; CHECK-NEXT: br i1 [[TMP3]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND:.*]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0) ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP5]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0) ; CHECK-NEXT: [[TMP7]] = add i32 [[LOOP_INDEX]], 16 ; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], [[TMP2]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER]] -; CHECK: [[LOOP_MEMCPY_RESIDUAL:.*]]: -; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP13:%.*]], %[[LOOP_MEMCPY_RESIDUAL]] ] +; CHECK-NEXT: br i1 [[TMP8]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]]: +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP15]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY:.*]], label %[[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]]: +; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]] ], [ [[TMP13:%.*]], %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP2]], [[RESIDUAL_LOOP_INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC_OFF]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP10]], i32 0, i32 0) @@ -385,12 +388,9 @@ define void @memcpy_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP11]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP12]], i32 0, i32 0) ; CHECK-NEXT: [[TMP13]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i32 [[TMP13]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION:.*]] -; CHECK: [[POST_LOOP_MEMCPY_EXPANSION]]: +; CHECK-NEXT: br i1 [[TMP14]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]], label %[[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION]] +; CHECK: [[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION]]: ; 
CHECK-NEXT: ret void -; CHECK: [[LOOP_MEMCPY_RESIDUAL_HEADER]]: -; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP15]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 %length, i1 false) ret void @@ -401,9 +401,9 @@ define void @memcpy_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrspace(7) ; CHECK-SAME: ptr addrspace(1) inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 16 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] @@ -456,8 +456,8 @@ define void @memcpy_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrspace(7) ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p7.p1.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -469,9 +469,9 @@ define void @memcpy_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrspace(1) ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(1) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -540,8 +540,8 @@ define void @memcpy_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrspace(1) ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16 ; 
CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p1.p7.i32(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -582,9 +582,9 @@ define void @memcpy_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr addrspa ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -653,8 +653,8 @@ define void @memcpy_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr addrspa ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -676,9 +676,9 @@ define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) i ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 
x i32> @@ -792,8 +792,8 @@ define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) i ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -868,9 +868,9 @@ define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace( ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[LOOP_INDEX_C:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX_C]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) @@ -986,8 +986,8 @@ define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace( ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p7.i64(ptr addrspace(7) %dst, ptr addrspace(7) %src, i64 8192, i1 false) @@ -1023,18 +1023,21 @@ define void @memcpy.inline_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15 ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[LENGTH]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_MEMCPY_EXPANSION:.*]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER:.*]] -; CHECK: [[LOOP_MEMCPY_EXPANSION]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ] +; CHECK-NEXT: br i1 [[TMP3]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND:.*]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], 
%[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0) ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP5]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0) ; CHECK-NEXT: [[TMP7]] = add i32 [[LOOP_INDEX]], 16 ; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], [[TMP2]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER]] -; CHECK: [[LOOP_MEMCPY_RESIDUAL:.*]]: -; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP13:%.*]], %[[LOOP_MEMCPY_RESIDUAL]] ] +; CHECK-NEXT: br i1 [[TMP8]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]]: +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP15]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY:.*]], label %[[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]]: +; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]] ], [ [[TMP13:%.*]], %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP2]], [[RESIDUAL_LOOP_INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC_OFF]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP10]], i32 0, i32 0) @@ -1042,12 +1045,9 @@ define void @memcpy.inline_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP11]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP12]], i32 0, i32 0) ; CHECK-NEXT: [[TMP13]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i32 [[TMP13]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION:.*]] -; CHECK: [[POST_LOOP_MEMCPY_EXPANSION]]: +; CHECK-NEXT: br i1 [[TMP14]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]], label %[[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION]] +; CHECK: [[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void -; CHECK: [[LOOP_MEMCPY_RESIDUAL_HEADER]]: -; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP15]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 %length, i1 false) ret void @@ -1058,9 +1058,9 @@ define void @memcpy.inline_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrs ; CHECK-SAME: ptr addrspace(1) inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, 
[[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 16 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] @@ -1113,8 +1113,8 @@ define void @memcpy.inline_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrs ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p1.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -1126,9 +1126,9 @@ define void @memcpy.inline_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrs ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(1) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -1197,8 +1197,8 @@ define void @memcpy.inline_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrs ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p1.p7.i32(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -1239,9 +1239,9 @@ define void @memcpy.inline_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: 
[[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -1310,8 +1310,8 @@ define void @memcpy.inline_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll index 5a9f53ec0077d..20a34dc997bbc 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -43,7 +43,7 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1) ; ; ALL-LABEL: @max_size_small_static_memcpy_caller0( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1 @@ -52,7 +52,7 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1) ; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1024, i1 false) @@ -63,7 +63,7 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1) define amdgpu_kernel void @min_size_large_static_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @min_size_large_static_memcpy_caller0( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1 @@ -72,7 +72,7 @@ define amdgpu_kernel void @min_size_large_static_memcpy_caller0(ptr addrspace(1) ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult 
i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -201,7 +201,7 @@ define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -210,8 +210,11 @@ define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -219,12 +222,9 @@ define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n, i1 false) ret void @@ -236,7 +236,7 @@ define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], 
[[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -245,8 +245,11 @@ define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -254,12 +257,9 @@ define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n, i1 false) ret void @@ -271,7 +271,7 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]] -; OPT: loop-memcpy-expansion2: +; OPT: dynamic-memcpy-expansion-main-body2: ; OPT-NEXT: [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX3]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -280,8 +280,11 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX3]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION2]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5]] -; OPT: loop-memcpy-residual4: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX6:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER5]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL4:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond5: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL4:%.*]], label 
[[POST_LOOP_MEMCPY_EXPANSION1:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body4: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX6:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER5]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL4]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX6]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -289,13 +292,13 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX6]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]] -; OPT: post-loop-memcpy-expansion1: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1]] +; OPT: dynamic-memcpy-post-expansion1: ; OPT-NEXT: [[TMP17:%.*]] = and i64 [[M:%.*]], 15 ; OPT-NEXT: [[TMP18:%.*]] = sub i64 [[M]], [[TMP17]] ; OPT-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP18]], 0 ; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP23:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP20]], align 1 @@ -304,8 +307,11 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ; OPT-NEXT: [[TMP23]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP24:%.*]] = icmp ult i64 [[TMP23]], [[TMP18]] ; OPT-NEXT: br i1 [[TMP24]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP29:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP17]], 0 +; OPT-NEXT: br i1 [[TMP31]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP29:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP25:%.*]] = add i64 [[TMP18]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP25]] ; OPT-NEXT: [[TMP27:%.*]] = load i8, ptr addrspace(1) [[TMP26]], align 1 @@ -313,15 +319,9 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ; OPT-NEXT: store i8 [[TMP27]], ptr addrspace(1) [[TMP28]], align 1 ; OPT-NEXT: [[TMP29]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP30:%.*]] = icmp ult i64 [[TMP29]], [[TMP17]] -; OPT-NEXT: br i1 [[TMP30]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP30]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP17]], 0 -; OPT-NEXT: br i1 [[TMP31]], label [[LOOP_MEMCPY_RESIDUAL]], label 
[[POST_LOOP_MEMCPY_EXPANSION]] -; OPT: loop-memcpy-residual-header5: -; OPT-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP32]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst0, ptr addrspace(1) %src, i64 %n, i1 false) call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %m, i1 false) @@ -334,7 +334,7 @@ define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace( ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1 @@ -343,8 +343,11 @@ define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace( ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 @@ -352,12 +355,9 @@ define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace( ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n, i1 false) ret void @@ -370,7 +370,7 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; MAX1024-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; MAX1024: loop-memcpy-expansion: +; MAX1024: dynamic-memcpy-expansion-main-body: ; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] 
], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; MAX1024-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; MAX1024-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -379,8 +379,11 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; MAX1024-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; MAX1024: loop-memcpy-residual: -; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; MAX1024: dynamic-memcpy-expansion-residual-cond: +; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; MAX1024: dynamic-memcpy-expansion-residual-body: +; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; MAX1024-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -388,20 +391,17 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; MAX1024-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; MAX1024: post-loop-memcpy-expansion: +; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; MAX1024: dynamic-memcpy-post-expansion: ; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST1:%.*]], ptr addrspace(1) [[SRC]], i64 102, i1 false) ; MAX1024-NEXT: ret void -; MAX1024: loop-memcpy-residual-header: -; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; ; ALL-LABEL: @memcpy_multi_use_one_function_keep_small( ; ALL-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15 ; ALL-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; ALL-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; ALL: loop-memcpy-expansion: +; ALL: dynamic-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX1]] ; ALL-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -410,8 +410,11 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; ALL-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX1]], 16 ; ALL-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; ALL: loop-memcpy-residual: -; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; ALL: dynamic-memcpy-expansion-residual-cond: +; 
ALL-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP2]], 0 +; ALL-NEXT: br i1 [[TMP27]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; ALL: dynamic-memcpy-expansion-residual-body: +; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; ALL-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -419,8 +422,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; ALL-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; ALL-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; ALL: post-loop-memcpy-expansion: +; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; ALL: dynamic-memcpy-post-expansion: ; ALL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 0 ; ALL-NEXT: [[TMP17:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP16]], align 1 ; ALL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 0 @@ -454,9 +457,6 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; ALL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 100 ; ALL-NEXT: store i16 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1 ; ALL-NEXT: ret void -; ALL: loop-memcpy-residual-header: -; ALL-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP2]], 0 -; ALL-NEXT: br i1 [[TMP27]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst0, ptr addrspace(1) %src, i64 %n, i1 false) call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 102, i1 false) @@ -466,7 +466,7 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1028( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -475,7 +475,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -489,7 +489,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace(1) %dst, ptr addrspace(1) %src) 
#0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1025( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -498,7 +498,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -512,7 +512,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1026( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -521,7 +521,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -535,7 +535,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1032( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -544,7 +544,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, 
ptr addrspace(1) [[DST]], i64 1024 @@ -558,7 +558,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1034( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -567,7 +567,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -585,7 +585,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1035( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -594,7 +594,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -616,7 +616,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1036( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -625,7 +625,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; 
OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -643,7 +643,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1039( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -652,7 +652,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -678,7 +678,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align2_global_align2_1039( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2 @@ -687,7 +687,7 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP16:%.*]] = load i64, ptr addrspace(1) [[TMP15]], align 2 ; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -713,7 +713,7 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -722,7 +722,7 @@ define amdgpu_kernel void 
@memcpy_global_align4_global_align4_1027(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -740,7 +740,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align2_global_align4_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -749,7 +749,7 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -767,7 +767,7 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align2_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2 @@ -776,7 +776,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 2 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -794,7 +794,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align4_private_align4_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ 
[[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 @@ -803,7 +803,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -821,7 +821,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspa define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align2_private_align4_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 @@ -830,7 +830,7 @@ define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 4 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -848,7 +848,7 @@ define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspa define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align1_private_align4_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 @@ -857,7 +857,7 @@ define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -875,7 +875,7 @@ define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr addrspa define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspace(5) 
%dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align4_private_align2_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2 @@ -884,7 +884,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -902,7 +902,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align4_private_align1_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1 @@ -911,7 +911,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 1 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -929,7 +929,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspa define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align2_private_align2_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2 @@ -938,7 +938,7 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2 ; OPT-NEXT: 
[[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -959,7 +959,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4 @@ -968,8 +968,11 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -977,12 +980,9 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 %n, i1 false) ret void @@ -994,7 +994,7 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 2 @@ -1003,8 +1003,11 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs ; OPT-NEXT: 
[[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -1012,12 +1015,9 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 %src, i64 %n, i1 false) ret void @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -1038,8 +1038,11 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr 
inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -1047,12 +1050,9 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 %src, i64 %n, i1 false) ret void @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4 @@ -1073,8 +1073,11 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 @@ -1082,12 +1085,9 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], 
label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 4 %dst, ptr addrspace(3) align 4 %src, i32 %n, i1 false) ret void @@ -1099,7 +1099,7 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 2 @@ -1108,8 +1108,11 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 @@ -1117,12 +1120,9 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 %src, i32 %n, i1 false) ret void @@ -1134,7 +1134,7 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1 @@ -1143,8 +1143,11 @@ define 
amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 @@ -1152,12 +1155,9 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 %src, i32 %n, i1 false) ret void @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4 @@ -1178,8 +1178,11 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] 
= add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -1187,12 +1190,9 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 %dst, ptr addrspace(1) align 4 %src, i32 %n, i1 false) ret void @@ -1204,7 +1204,7 @@ define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrsp ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4 @@ -1213,8 +1213,11 @@ define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrsp ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 @@ -1222,12 +1225,9 @@ define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrsp ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = 
icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %dst, ptr addrspace(3) align 4 %src, i32 %n, i1 false) ret void @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5) ; ; ALL-LABEL: @memmove_private_align1_global_align1( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]] @@ -1505,7 +1505,7 @@ define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5) ; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) %dst, ptr addrspace(1) %src, i64 256, i1 false) @@ -1519,7 +1519,7 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1) ; ; ALL-LABEL: @memmove_global_align1_private_align1( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]] @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1) ; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) %dst, ptr addrspace(5) %src, i64 256, i1 false) @@ -1722,7 +1722,7 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3) ; ; ALL-LABEL: @memmove_local_align1_private_align1( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]] @@ -1731,7 +1731,7 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3) ; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 256, i1 false) @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 
[[SIZE]], [[TMP2]] ; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; MAX1024: loop-memcpy-expansion: +; MAX1024: dynamic-memcpy-expansion-main-body: ; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]] @@ -1753,8 +1753,11 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; MAX1024: loop-memcpy-residual: -; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; MAX1024: dynamic-memcpy-expansion-residual-cond: +; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; MAX1024: dynamic-memcpy-expansion-residual-body: +; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]] ; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META0]] @@ -1762,19 +1765,16 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META0]] ; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; MAX1024: post-loop-memcpy-expansion: +; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; MAX1024: dynamic-memcpy-post-expansion: ; MAX1024-NEXT: ret void -; MAX1024: loop-memcpy-residual-header: -; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; ; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size( ; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 ; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; ALL: loop-memcpy-expansion: +; ALL: dynamic-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]] @@ -1783,8 +1783,11 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 
[[TMP3]] ; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; ALL: loop-memcpy-residual: -; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; ALL: dynamic-memcpy-expansion-residual-cond: +; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; ALL: dynamic-memcpy-expansion-residual-body: +; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]] ; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META9]] @@ -1792,12 +1795,9 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META9]] ; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; ALL: post-loop-memcpy-expansion: +; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; ALL: dynamic-memcpy-post-expansion: ; ALL-NEXT: ret void -; ALL: loop-memcpy-residual-header: -; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size, i1 false) ret void @@ -1810,7 +1810,7 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5) ; ; ALL-LABEL: @memmove_private_align1_local_align1( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]] @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5) ; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 256, i1 false) @@ -1832,7 +1832,7 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; MAX1024: loop-memcpy-expansion: +; MAX1024: dynamic-memcpy-expansion-main-body: ; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; MAX1024-NEXT: [[TMP5:%.*]] = 
load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]] @@ -1841,8 +1841,11 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; MAX1024: loop-memcpy-residual: -; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; MAX1024: dynamic-memcpy-expansion-residual-cond: +; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; MAX1024: dynamic-memcpy-expansion-residual-body: +; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope [[META3]] @@ -1850,19 +1853,16 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias [[META3]] ; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; MAX1024: post-loop-memcpy-expansion: +; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; MAX1024: dynamic-memcpy-post-expansion: ; MAX1024-NEXT: ret void -; MAX1024: loop-memcpy-residual-header: -; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; ; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size( ; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 ; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; ALL: loop-memcpy-expansion: +; ALL: dynamic-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]] @@ -1871,8 +1871,11 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; ALL: loop-memcpy-residual: -; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; ALL: dynamic-memcpy-expansion-residual-cond: +; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; ALL: 
dynamic-memcpy-expansion-residual-body: +; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope [[META15]] @@ -1880,12 +1883,9 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias [[META15]] ; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; ALL: post-loop-memcpy-expansion: +; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; ALL: dynamic-memcpy-post-expansion: ; ALL-NEXT: ret void -; ALL: loop-memcpy-residual-header: -; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size, i1 false) ret void @@ -2367,7 +2367,7 @@ define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) { ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SPEC_SELECT]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1 @@ -2376,8 +2376,11 @@ define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) { ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP11]], align 1 @@ -2385,12 +2388,9 @@ define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) { ; OPT-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; 
OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; entry: %arrayidx = getelementptr [32 x [8 x i64]], ptr %y, i64 0, i64 %idxprom @@ -2439,7 +2439,7 @@ define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace( ; ; ALL-LABEL: @memcpy_volatile( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP1]], align 1 @@ -2448,7 +2448,7 @@ define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace( ; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 512 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 512, i1 true) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 43752c22b1f3e..faf70f55876f7 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -12,7 +12,7 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s16, v4 ; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v6, v5, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB0_1: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB0_1: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v7, s5 ; CHECK-NEXT: v_mov_b32_e32 v6, s4 @@ -20,28 +20,28 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s4, v8 ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_cmp_ge_u64_e64 s[6:7], s[4:5], 32 +; CHECK-NEXT: v_cmp_lt_u64_e64 s[6:7], s[4:5], 32 ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc ; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[10:13] -; CHECK-NEXT: s_cbranch_vccz .LBB0_1 -; CHECK-NEXT: ; %bb.2: ; %loop-memcpy-residual-header +; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %dynamic-memcpy-expansion-residual-cond ; CHECK-NEXT: s_branch .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: s_branch .LBB0_5 -; CHECK-NEXT: .LBB0_4: ; %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge +; CHECK-NEXT: .LBB0_4: ; %dynamic-memcpy-expansion-residual-cond.dynamic-memcpy-post-expansion_crit_edge ; CHECK-NEXT: v_lshlrev_b64 v[6:7], 6, v[2:3] ; CHECK-NEXT: s_cbranch_execnz .LBB0_8 -; CHECK-NEXT: .LBB0_5: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: .LBB0_5: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: s_add_u32 s4, s16, 32 ; CHECK-NEXT: s_addc_u32 s5, s17, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 ; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 ; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ; 
%bb.6: ; %loop-memcpy-residual +; CHECK-NEXT: ; %bb.6: ; %dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: s_add_u32 s6, 32, s4 ; CHECK-NEXT: s_addc_u32 s7, 0, s5 ; CHECK-NEXT: v_mov_b32_e32 v6, s6 @@ -57,7 +57,7 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: ; %bb.7: ; CHECK-NEXT: v_mov_b32_e32 v7, v5 ; CHECK-NEXT: v_mov_b32_e32 v6, v4 -; CHECK-NEXT: .LBB0_8: ; %post-loop-memcpy-expansion +; CHECK-NEXT: .LBB0_8: ; %dynamic-memcpy-post-expansion ; CHECK-NEXT: v_and_b32_e32 v2, 15, v0 ; CHECK-NEXT: v_and_b32_e32 v0, -16, v0 ; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, v6, v0 @@ -76,18 +76,18 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: .LBB0_10: ; %Flow16 ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; CHECK-NEXT: s_cbranch_vccz .LBB0_19 +; CHECK-NEXT: s_cbranch_vccz .LBB0_18 ; CHECK-NEXT: .LBB0_11: ; %while.cond ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_13 Depth 2 ; CHECK-NEXT: ; Child Loop BB0_17 Depth 2 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_14 -; CHECK-NEXT: ; %bb.12: ; %loop-memcpy-expansion2.preheader +; CHECK-NEXT: ; %bb.12: ; %dynamic-memcpy-expansion-main-body2.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 -; CHECK-NEXT: .LBB0_13: ; %loop-memcpy-expansion2 +; CHECK-NEXT: .LBB0_13: ; %dynamic-memcpy-expansion-main-body2 ; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v6, s10 @@ -108,37 +108,33 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_cbranch_execz .LBB0_10 -; CHECK-NEXT: ; %bb.15: ; %loop-memcpy-residual-header5 +; CHECK-NEXT: ; %bb.15: ; %dynamic-memcpy-expansion-residual-cond5 ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; CHECK-NEXT: s_xor_b64 s[10:11], exec, s[8:9] +; CHECK-NEXT: s_and_saveexec_b64 s[10:11], s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 -; CHECK-NEXT: ; %bb.16: ; %loop-memcpy-residual4.preheader +; CHECK-NEXT: ; %bb.16: ; %dynamic-memcpy-expansion-residual-body4.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 -; CHECK-NEXT: s_mov_b64 s[14:15], 0 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 -; CHECK-NEXT: .LBB0_17: ; %loop-memcpy-residual4 +; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: .LBB0_17: ; %dynamic-memcpy-expansion-residual-body4 ; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_mov_b32_e32 v10, s15 -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s14, v0 +; CHECK-NEXT: v_mov_b32_e32 v10, s13 +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s12, v0 ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v10, vcc ; CHECK-NEXT: flat_load_ubyte v11, v[6:7] -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s14, v4 -; CHECK-NEXT: s_add_u32 s14, s14, 1 -; CHECK-NEXT: s_addc_u32 s15, s15, 0 -; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[14:15], v[2:3] +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s12, v4 +; CHECK-NEXT: s_add_u32 s12, s12, 1 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[12:13], v[2:3] ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v10, vcc -; CHECK-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] +; CHECK-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
CHECK-NEXT: flat_store_byte v[6:7], v11 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[14:15] ; CHECK-NEXT: s_cbranch_execnz .LBB0_17 -; CHECK-NEXT: ; %bb.18: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] ; CHECK-NEXT: s_branch .LBB0_9 -; CHECK-NEXT: .LBB0_19: ; %DummyReturnBlock +; CHECK-NEXT: .LBB0_18: ; %DummyReturnBlock ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index cb68a987c243b..4f2816538b1ff 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -14,7 +14,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB0_1: ; %load-store-loop +; CHECK-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo @@ -83,7 +83,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 -; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -108,7 +108,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB0_1: ; %load-store-loop +; ALIGNED-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo @@ -757,7 +757,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 ; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 -; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; ALIGNED-NEXT: s_clause 0x10 ; 68-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 @@ -784,7 +784,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 ; UNROLL3-NEXT: .p2align 6 -; UNROLL3-NEXT: .LBB0_1: ; %load-store-loop +; UNROLL3-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 ; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo @@ -805,7 +805,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] ; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB0_1 -; UNROLL3-NEXT: ; %bb.2: 
; %memcpy-split +; UNROLL3-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 @@ -824,7 +824,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB1_1: ; %load-store-loop +; CHECK-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo @@ -884,7 +884,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 -; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memcpy_p1_p1_sz2048: @@ -899,7 +899,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB1_1: ; %load-store-loop +; ALIGNED-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo @@ -1520,7 +1520,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1 -; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 @@ -1538,7 +1538,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 ; UNROLL3-NEXT: .p2align 6 -; UNROLL3-NEXT: .LBB1_1: ; %load-store-loop +; UNROLL3-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 ; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo @@ -1559,7 +1559,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] ; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB1_1 -; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2016 @@ -1577,7 +1577,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB2_1: ; 
%load-store-loop +; CHECK-NEXT: .LBB2_1: ; %static-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo @@ -1639,7 +1639,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB2_1 -; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -1647,7 +1647,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: .LBB2_1: ; %load-store-loop +; ALIGNED-NEXT: .LBB2_1: ; %static-memcpy-expansion-main-body ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, s5, v3, vcc_lo @@ -2141,7 +2141,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1 ; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB2_1 -; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -2150,7 +2150,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 ; UNROLL3-NEXT: .p2align 6 -; UNROLL3-NEXT: .LBB2_1: ; %load-store-loop +; UNROLL3-NEXT: .LBB2_1: ; %static-memcpy-expansion-main-body ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 ; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo @@ -2171,7 +2171,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] ; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB2_1 -; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; UNROLL3-NEXT: s_clause 0x1 ; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 ; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2032 @@ -2191,7 +2191,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB3_1: ; %load-store-loop +; CHECK-NEXT: .LBB3_1: ; %static-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:252 @@ -2392,7 +2392,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB3_1 -; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memcpy_p5_p5_sz2048: @@ -2447,7 +2447,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: 
buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB3_1: ; %load-store-loop +; ALIGNED-NEXT: .LBB3_1: ; %static-memcpy-expansion-main-body ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x34 ; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:255 @@ -3495,7 +3495,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB3_1 -; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 @@ -3554,7 +3554,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: v_mov_b32_e32 v2, v1 ; UNROLL3-NEXT: v_mov_b32_e32 v3, v0 ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: .LBB3_1: ; %load-store-loop +; UNROLL3-NEXT: .LBB3_1: ; %static-memcpy-expansion-main-body ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb ; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:44 @@ -3600,7 +3600,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3 ; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB3_1 -; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; UNROLL3-NEXT: s_clause 0x3 ; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2028 ; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2024 @@ -3638,7 +3638,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB4_1: ; %load-store-loop +; CHECK-NEXT: .LBB4_1: ; %static-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 @@ -3741,7 +3741,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB4_1 -; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -3799,7 +3799,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop +; ALIGNED-NEXT: .LBB4_1: ; %static-memcpy-expansion-main-body ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 @@ -5282,7 +5282,7 @@ define void @memcpy_p0_p5_sz2048(ptr 
addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1 -; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 @@ -5342,7 +5342,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 ; UNROLL3-NEXT: s_inst_prefetch 0x1 ; UNROLL3-NEXT: .p2align 6 -; UNROLL3-NEXT: .LBB4_1: ; %load-store-loop +; UNROLL3-NEXT: .LBB4_1: ; %static-memcpy-expansion-main-body ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb ; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen @@ -5370,7 +5370,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 ; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB4_1 -; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; UNROLL3-NEXT: s_inst_prefetch 0x2 ; UNROLL3-NEXT: s_clause 0x3 ; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016 diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index 953511db10b29..d95965caa81ab 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -1051,10 +1051,10 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB7_3 -; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v2 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .LBB7_2: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB7_2: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[10:13], v9 ; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4 @@ -1073,15 +1073,14 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] -; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB7_7 -; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_cbranch_execz .LBB7_6 +; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo -; CHECK-NEXT: .LBB7_5: ; %loop-memcpy-residual +; CHECK-NEXT: .LBB7_5: ; %dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v7, v2 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4 @@ -1095,9 +1094,7 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: global_store_byte v[3:4], v7, off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB7_5 -; CHECK-NEXT: ; %bb.6: ; %Flow -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: 
.LBB7_7: ; %Flow7
+; CHECK-NEXT: .LBB7_6: ; %Flow7
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -1263,11 +1260,11 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT: s_cbranch_execz .LBB9_3
-; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v9, v2
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: .p2align 6
-; CHECK-NEXT: .LBB9_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: .LBB9_2: ; %dynamic-memcpy-expansion-main-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: s_clause 0x3
 ; CHECK-NEXT: buffer_load_dword v10, v9, s[0:3], 0 offen
@@ -1290,15 +1287,14 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
-; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
-; CHECK-NEXT: s_cbranch_execz .LBB9_7
-; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: s_cbranch_execz .LBB9_6
+; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
-; CHECK-NEXT: .LBB9_5: ; %loop-memcpy-residual
+; CHECK-NEXT: .LBB9_5: ; %dynamic-memcpy-expansion-residual-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen
 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4
@@ -1312,9 +1308,7 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: global_store_byte v[3:4], v7, off
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT: s_cbranch_execnz .LBB9_5
-; CHECK-NEXT: ; %bb.6: ; %Flow
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT: .LBB9_7: ; %Flow7
+; CHECK-NEXT: .LBB9_6: ; %Flow7
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -1479,10 +1473,10 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT: s_cbranch_execz .LBB11_3
-; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v9, v0
 ; CHECK-NEXT: s_mov_b32 s7, 0
-; CHECK-NEXT: .LBB11_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: .LBB11_2: ; %dynamic-memcpy-expansion-main-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo
@@ -1501,15 +1495,14 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
-; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
-; CHECK-NEXT: s_cbranch_execz .LBB11_7
-; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: s_cbranch_execz .LBB11_6
+; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
-; CHECK-NEXT: .LBB11_5: ; %loop-memcpy-residual
+; CHECK-NEXT: .LBB11_5: ; %dynamic-memcpy-expansion-residual-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo
@@ -1523,9 +1516,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT: s_cbranch_execnz .LBB11_5
-; CHECK-NEXT: ; %bb.6: ; %Flow
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT: .LBB11_7: ; %Flow7
+; CHECK-NEXT: .LBB11_6: ; %Flow7
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1673,10 +1664,10 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT: s_cbranch_execz .LBB13_3
-; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v9, v0
 ; CHECK-NEXT: s_mov_b32 s7, 0
-; CHECK-NEXT: .LBB13_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: .LBB13_2: ; %dynamic-memcpy-expansion-main-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo
@@ -1695,15 +1686,14 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
-; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
-; CHECK-NEXT: s_cbranch_execz .LBB13_7
-; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: s_cbranch_execz .LBB13_6
+; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
-; CHECK-NEXT: .LBB13_5: ; %loop-memcpy-residual
+; CHECK-NEXT: .LBB13_5: ; %dynamic-memcpy-expansion-residual-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo
@@ -1717,9 +1707,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT: s_cbranch_execnz .LBB13_5
-; CHECK-NEXT: ; %bb.6: ; %Flow
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT: .LBB13_7: ; %Flow7
+; CHECK-NEXT: .LBB13_6: ; %Flow7
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1740,12 +1728,12 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: v_and_b32_e32 v5, 15, v4
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3]
 ; CHECK-NEXT: s_cbranch_execz .LBB14_3
-; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v7, v1
 ; CHECK-NEXT: v_mov_b32_e32 v8, v0
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: .p2align 6
-; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: .LBB14_2: ; %dynamic-memcpy-expansion-main-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: s_clause 0x3
 ; CHECK-NEXT: buffer_load_dword v9, v7, s[0:3], 0 offen
@@ -1767,14 +1755,13 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
-; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
-; CHECK-NEXT: s_cbranch_execz .LBB14_7
-; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: s_cbranch_execz .LBB14_6
+; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader
 ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
-; CHECK-NEXT: .LBB14_5: ; %loop-memcpy-residual
+; CHECK-NEXT: .LBB14_5: ; %dynamic-memcpy-expansion-residual-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
 ; CHECK-NEXT: s_add_u32 s4, s4, 1
@@ -1787,9 +1774,7 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT: s_cbranch_execnz .LBB14_5
-; CHECK-NEXT: ; %bb.6: ; %Flow
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT: .LBB14_7: ; %Flow12
+; CHECK-NEXT: .LBB14_6: ; %Flow12
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1959,11 +1944,11 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT: s_cbranch_execz .LBB16_3
-; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v9, v0
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: .p2align 6
-; CHECK-NEXT: .LBB16_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: .LBB16_2: ; %dynamic-memcpy-expansion-main-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo
@@ -1985,15 +1970,14 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
-; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
-; CHECK-NEXT: s_cbranch_execz .LBB16_7
-; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: s_cbranch_execz .LBB16_6
+; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
-; CHECK-NEXT: .LBB16_5: ; %loop-memcpy-residual
+; CHECK-NEXT: .LBB16_5: ; %dynamic-memcpy-expansion-residual-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo
@@ -2007,9 +1991,7 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT: s_cbranch_execnz .LBB16_5
-; CHECK-NEXT: ; %bb.6: ; %Flow
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT: .LBB16_7: ; %Flow7
+; CHECK-NEXT: .LBB16_6: ; %Flow7
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -2029,12 +2011,12 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: v_and_b32_e32 v5, 15, v4
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3]
 ; CHECK-NEXT: s_cbranch_execz .LBB17_3
-; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v7, v1
 ; CHECK-NEXT: v_mov_b32_e32 v8, v0
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: .p2align 6
-; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: .LBB17_2: ; %dynamic-memcpy-expansion-main-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ds_read_b128 v[9:12], v7
 ; CHECK-NEXT: s_add_u32 s4, s4, 16
@@ -2055,14 +2037,13 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
-; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
-; CHECK-NEXT: s_cbranch_execz .LBB17_7
-; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: s_cbranch_execz .LBB17_6
+; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader
 ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
-; CHECK-NEXT: .LBB17_5: ; %loop-memcpy-residual
+; CHECK-NEXT: .LBB17_5: ; %dynamic-memcpy-expansion-residual-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ds_read_u8 v2, v1
 ; CHECK-NEXT: s_add_u32 s4, s4, 1
@@ -2075,9 +2056,7 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT: s_cbranch_execnz .LBB17_5
-; CHECK-NEXT: ; %bb.6: ; %Flow
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT: .LBB17_7: ; %Flow12
+; CHECK-NEXT: .LBB17_6: ; %Flow12
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -2097,11 +2076,11 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT: s_cbranch_execz .LBB18_3
-; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v9, v0
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: .p2align 6
-; CHECK-NEXT: .LBB18_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: .LBB18_2: ; %dynamic-memcpy-expansion-main-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo
@@ -2123,15 +2102,14 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
-; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
-; CHECK-NEXT: s_cbranch_execz .LBB18_7
-; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: s_cbranch_execz .LBB18_6
+; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
-; CHECK-NEXT: .LBB18_5: ; %loop-memcpy-residual
+; CHECK-NEXT: .LBB18_5: ; %dynamic-memcpy-expansion-residual-body
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo
@@ -2145,9 +2123,7 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT: s_cbranch_execnz .LBB18_5
-; CHECK-NEXT: ; %bb.6: ; %Flow
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT: .LBB18_7: ; %Flow7
+; CHECK-NEXT: .LBB18_6: ; %Flow7
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
index 297b2b984cdae..ad78e0fe7438b 100644
--- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -20,19 +20,19 @@ entry:
 ; IR-LABEL: @memcpy_caller
 ; IR: entry:
 ; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
-; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+; IR: br i1 [[Cond]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion
-; IR: loop-memcpy-expansion:
-; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; IR: dynamic-memcpy-expansion-main-body:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %dynamic-memcpy-expansion-main-body ]
 ; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index
 ; IR: [[Load:%[0-9]+]] = load i8, ptr [[SrcGep]]
 ; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index
 ; IR: store i8 [[Load]], ptr [[DstGep]]
 ; IR: [[IndexInc]] = add i64 %loop-index, 1
 ; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
-; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+; IR: br i1 [[Cond2]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion
-; IR-LABEL: post-loop-memcpy-expansion:
+; IR-LABEL: dynamic-memcpy-post-expansion:
 ; IR: ret ptr %dst
 ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_caller
@@ -53,19 +53,19 @@ entry:
 ; IR-LABEL: @memcpy_volatile_caller
 ; IR: entry:
 ; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
-; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+; IR: br i1 [[Cond]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion
-; IR: loop-memcpy-expansion:
-; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; IR: dynamic-memcpy-expansion-main-body:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %dynamic-memcpy-expansion-main-body ]
 ; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index
 ; IR: [[Load:%[0-9]+]] = load volatile i8, ptr [[SrcGep]]
 ; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index
 ; IR: store volatile i8 [[Load]], ptr [[DstGep]]
 ; IR: [[IndexInc]] = add i64 %loop-index, 1
 ; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
-; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+; IR: br i1 [[Cond2]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion
-; IR-LABEL: post-loop-memcpy-expansion:
+; IR-LABEL: dynamic-memcpy-post-expansion:
 ; IR: ret ptr %dst
@@ -97,16 +97,16 @@ entry:
 ; Check that calls with compile-time constant size are handled correctly
 ; IR-LABEL: @memcpy_known_size
 ; IR: entry:
-; IR: br label %load-store-loop
-; IR: load-store-loop:
-; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
+; IR: br label %static-memcpy-expansion-main-body
+; IR: static-memcpy-expansion-main-body:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %static-memcpy-expansion-main-body ]
 ; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index
 ; IR: [[Load:%[0-9]+]] = load i8, ptr [[SrcGep]]
 ; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index
 ; IR: store i8 [[Load]], ptr [[DstGep]]
 ; IR: [[IndexInc]] = add i64 %loop-index, 1
 ; IR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
-; IR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
+; IR: br i1 [[Cond]], label %static-memcpy-expansion-main-body, label %static-memcpy-post-expansion
 }
 define ptr @memset_caller(ptr %dst, i32 %c, i64 %n) #0 {
diff --git a/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp b/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp
index dd03b4f2ae971..752029e54f394 100644
--- a/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp
+++ b/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp
@@ -120,7 +120,8 @@ TEST_F(MemTransferLowerTest, MemCpyKnownLength) {
     MemCpyInst *MemCpyI = cast<MemCpyInst>(Inst);
     auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
     expandMemCpyAsLoop(MemCpyI, TTI, &SE);
-    auto *CopyLoopBB = getBasicBlockByName(F, "load-store-loop");
+    auto *CopyLoopBB =
+        getBasicBlockByName(F, "static-memcpy-expansion-main-body");
     Instruction *LoadInst =
         getInstructionByOpcode(*CopyLoopBB, Instruction::Load, 1);
     EXPECT_NE(nullptr, LoadInst->getMetadata(LLVMContext::MD_alias_scope));
@@ -203,7 +204,8 @@ TEST_F(MemTransferLowerTest, AtomicMemCpyKnownLength) {
     AnyMemCpyInst *MemCpyI = cast<AnyMemCpyInst>(Inst);
     auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
     expandAtomicMemCpyAsLoop(MemCpyI, TTI, &SE);
-    auto *CopyLoopBB = getBasicBlockByName(F, "load-store-loop");
+    auto *CopyLoopBB =
+        getBasicBlockByName(F, "static-memcpy-expansion-main-body");
     Instruction *LoadInst =
         getInstructionByOpcode(*CopyLoopBB, Instruction::Load, 1);
     EXPECT_TRUE(LoadInst->isAtomic());
@@ -248,7 +250,8 @@ TEST_F(MemTransferLowerTest, AtomicMemCpyUnKnownLength) {
     auto *MemCpyI = cast<AnyMemCpyInst>(Inst);
     auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
    expandAtomicMemCpyAsLoop(MemCpyI, TTI, &SE);
-    auto *CopyLoopBB = getBasicBlockByName(F, "loop-memcpy-expansion");
+    auto *CopyLoopBB =
+        getBasicBlockByName(F, "dynamic-memcpy-expansion-main-body");
     Instruction *LoadInst =
         getInstructionByOpcode(*CopyLoopBB, Instruction::Load, 1);
     EXPECT_TRUE(LoadInst->isAtomic());
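Note on the renamed blocks checked above: the -expansion-main-body, -expansion-residual-body, and -post-expansion suffixes are appended by the new insertLoopExpansion() helper to the BBNamePrefix its caller supplies, so the runtime-length and constant-length lowerings yield dynamic-memcpy-* and static-memcpy-* blocks, respectively. The sketch below illustrates roughly what such a call site looks like; only the helper's signature and the block-naming scheme come from this patch, while the wrapper function and the 16-byte/1-byte step choice are illustrative assumptions (they mirror the AMDGPU expansions above, which align the length down to a multiple of 16 and copy the tail byte by byte).

// Hypothetical call site, for illustration only; assumes the declarations
// introduced in LowerMemIntrinsics.cpp above.
static void expandDynamicMemCpySketch(Instruction *InsertBefore, Value *Len) {
  // Main loop advances 16 bytes per iteration; the residual loop covers the
  // remaining Len % 16 bytes one byte at a time.
  LoopExpansionInfo LEI =
      insertLoopExpansion(InsertBefore, Len, /*MainLoopStep=*/16,
                          /*ResidualLoopStep=*/1, "dynamic-memcpy");
  // A real lowering would now emit the loads and stores of the copy at
  // LEI.MainLoopIP and LEI.ResidualLoopIP, addressing the source and
  // destination via LEI.MainLoopIndex and LEI.ResidualLoopIndex.
  (void)LEI;
}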