From b221dca4458a7e026e82d07ebc6fad010aa4793d Mon Sep 17 00:00:00 2001 From: cf-zhao Date: Fri, 17 May 2024 08:28:59 +0000 Subject: [PATCH] Revert "Support stack clash protection" This reverts commit 4f4298791f15f26e0649f57c6edfd999af51ec41. (cherry picked from commit f9af047c9f0602b71489d2f042fecdbe22ae100f) --- ...acksave-stackrestore-in-IRTranslator.patch | 315 -- ...tion-of-locals-and-stack-realignment.patch | 546 ---- ...Stack-probing-for-function-prologues.patch | 2652 ----------------- ...-for-dynamic-allocas-in-SelectionDAG.patch | 744 ----- ...ng-for-dynamic-allocas-in-GlobalISel.patch | 496 --- ...-for-stack-clash-protection-backport.patch | 177 -- llvm.spec | 12 +- 7 files changed, 1 insertion(+), 4941 deletions(-) delete mode 100644 0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch delete mode 100644 0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch delete mode 100644 0023-Backport-AArch64-Stack-probing-for-function-prologues.patch delete mode 100644 0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch delete mode 100644 0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch delete mode 100644 0026-Update-testcase-for-stack-clash-protection-backport.patch diff --git a/0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch b/0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch deleted file mode 100644 index 3fbe2e7..0000000 --- a/0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch +++ /dev/null @@ -1,315 +0,0 @@ -From 7aeecae6393d5c3333beec64ad343ed1cabe75e4 Mon Sep 17 00:00:00 2001 -From: Matt Arsenault -Date: Sat, 29 Jul 2023 19:12:24 -0400 -Subject: [PATCH 1/7] GlobalISel: Don't expand stacksave/stackrestore in - IRTranslator - -In some (likely invalid edge cases anyway), it's not correct to -directly copy the stack pointer register. 
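-
-For illustration, a block-scoped VLA is one C construct for which Clang
-typically emits the @llvm.stacksave/@llvm.stackrestore pair that is now
-translated to the new generic opcodes; a minimal sketch (the function
-and variable names are arbitrary):
-
-  void use(int *p);
-
-  void f(int n) {
-    for (int i = 0; i < n; ++i) {
-      int vla[n]; /* stack pointer saved before each allocation... */
-      use(vla);
-    }             /* ...and restored when the scope is left */
-  }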
---- - .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 2 + - llvm/include/llvm/Support/TargetOpcodes.def | 6 +++ - llvm/include/llvm/Target/GenericOpcodes.td | 12 ++++++ - llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 25 ++---------- - .../CodeGen/GlobalISel/LegalizerHelper.cpp | 26 +++++++++++++ - .../AArch64/GISel/AArch64LegalizerInfo.cpp | 4 +- - llvm/lib/Target/X86/X86LegalizerInfo.cpp | 4 ++ - .../AArch64/GlobalISel/arm64-irtranslator.ll | 4 +- - .../GlobalISel/legalizer-info-validation.mir | 10 ++++- - .../GlobalISel/stacksave-stackrestore.ll | 35 +++++++++++++++++ - .../X86/GlobalISel/stacksave-stackrestore.ll | 39 +++++++++++++++++++ - 11 files changed, 141 insertions(+), 26 deletions(-) - create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll - create mode 100644 llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll - -diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h -index a568edd0e640..9288091874cf 100644 ---- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h -+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h -@@ -401,6 +401,8 @@ public: - LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI); - LegalizeResult lowerShuffleVector(MachineInstr &MI); - LegalizeResult lowerDynStackAlloc(MachineInstr &MI); -+ LegalizeResult lowerStackSave(MachineInstr &MI); -+ LegalizeResult lowerStackRestore(MachineInstr &MI); - LegalizeResult lowerExtract(MachineInstr &MI); - LegalizeResult lowerInsert(MachineInstr &MI); - LegalizeResult lowerSADDO_SSUBO(MachineInstr &MI); -diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def -index 186bea75ae96..c92ce6dc701c 100644 ---- a/llvm/include/llvm/Support/TargetOpcodes.def -+++ b/llvm/include/llvm/Support/TargetOpcodes.def -@@ -763,6 +763,12 @@ HANDLE_TARGET_OPCODE(G_JUMP_TABLE) - /// Generic dynamic stack allocation. - HANDLE_TARGET_OPCODE(G_DYN_STACKALLOC) - -+/// Generic stack pointer save. -+HANDLE_TARGET_OPCODE(G_STACKSAVE) -+ -+/// Generic stack pointer restore. -+HANDLE_TARGET_OPCODE(G_STACKRESTORE) -+ - /// Strict floating point instructions. - HANDLE_TARGET_OPCODE(G_STRICT_FADD) - HANDLE_TARGET_OPCODE(G_STRICT_FSUB) -diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td -index 00d56d1c4bd5..e8cfaeab3cd8 100644 ---- a/llvm/include/llvm/Target/GenericOpcodes.td -+++ b/llvm/include/llvm/Target/GenericOpcodes.td -@@ -225,6 +225,18 @@ def G_DYN_STACKALLOC : GenericInstruction { - let hasSideEffects = true; - } - -+def G_STACKSAVE : GenericInstruction { -+ let OutOperandList = (outs ptype0:$dst); -+ let InOperandList = (ins); -+ let hasSideEffects = true; -+} -+ -+def G_STACKRESTORE : GenericInstruction { -+ let OutOperandList = (outs); -+ let InOperandList = (ins ptype0:$src); -+ let hasSideEffects = true; -+} -+ - def G_FREEZE : GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src); -diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp -index 9a67a8d05a4d..e4b837c6b8ce 100644 ---- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp -+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp -@@ -2229,31 +2229,12 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, - return true; - } - case Intrinsic::stacksave: { -- // Save the stack pointer to the location provided by the intrinsic. 
-- Register Reg = getOrCreateVReg(CI); -- Register StackPtr = MF->getSubtarget() -- .getTargetLowering() -- ->getStackPointerRegisterToSaveRestore(); -- -- // If the target doesn't specify a stack pointer, then fall back. -- if (!StackPtr) -- return false; -- -- MIRBuilder.buildCopy(Reg, StackPtr); -+ MIRBuilder.buildInstr(TargetOpcode::G_STACKSAVE, {getOrCreateVReg(CI)}, {}); - return true; - } - case Intrinsic::stackrestore: { -- // Restore the stack pointer from the location provided by the intrinsic. -- Register Reg = getOrCreateVReg(*CI.getArgOperand(0)); -- Register StackPtr = MF->getSubtarget() -- .getTargetLowering() -- ->getStackPointerRegisterToSaveRestore(); -- -- // If the target doesn't specify a stack pointer, then fall back. -- if (!StackPtr) -- return false; -- -- MIRBuilder.buildCopy(StackPtr, Reg); -+ MIRBuilder.buildInstr(TargetOpcode::G_STACKRESTORE, {}, -+ {getOrCreateVReg(*CI.getArgOperand(0))}); - return true; - } - case Intrinsic::cttz: -diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp -index f0da0d88140f..75d9789be4d0 100644 ---- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp -+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp -@@ -3503,6 +3503,10 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { - return lowerShuffleVector(MI); - case G_DYN_STACKALLOC: - return lowerDynStackAlloc(MI); -+ case G_STACKSAVE: -+ return lowerStackSave(MI); -+ case G_STACKRESTORE: -+ return lowerStackRestore(MI); - case G_EXTRACT: - return lowerExtract(MI); - case G_INSERT: -@@ -6810,6 +6814,28 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { - return Legalized; - } - -+LegalizerHelper::LegalizeResult -+LegalizerHelper::lowerStackSave(MachineInstr &MI) { -+ Register StackPtr = TLI.getStackPointerRegisterToSaveRestore(); -+ if (!StackPtr) -+ return UnableToLegalize; -+ -+ MIRBuilder.buildCopy(MI.getOperand(0), StackPtr); -+ MI.eraseFromParent(); -+ return Legalized; -+} -+ -+LegalizerHelper::LegalizeResult -+LegalizerHelper::lowerStackRestore(MachineInstr &MI) { -+ Register StackPtr = TLI.getStackPointerRegisterToSaveRestore(); -+ if (!StackPtr) -+ return UnableToLegalize; -+ -+ MIRBuilder.buildCopy(StackPtr, MI.getOperand(0)); -+ MI.eraseFromParent(); -+ return Legalized; -+} -+ - LegalizerHelper::LegalizeResult - LegalizerHelper::lowerExtract(MachineInstr &MI) { - auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); -diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp -index d905da4eaec3..f0130a0be29d 100644 ---- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp -+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp -@@ -797,7 +797,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) - return Query.Types[0] == p0 && Query.Types[1] == s64; - }); - -- getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); -+ getActionDefinitionsBuilder({G_DYN_STACKALLOC, -+ G_STACKSAVE, -+ G_STACKRESTORE}).lower(); - - if (ST.hasMOPS()) { - // G_BZERO is not supported. 
Currently it is only emitted by -diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp -index a4a247f85f3d..104461cff0a9 100644 ---- a/llvm/lib/Target/X86/X86LegalizerInfo.cpp -+++ b/llvm/lib/Target/X86/X86LegalizerInfo.cpp -@@ -528,6 +528,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, - // memory intrinsics - getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); - -+ getActionDefinitionsBuilder({G_DYN_STACKALLOC, -+ G_STACKSAVE, -+ G_STACKRESTORE}).lower(); -+ - // fp intrinsics - getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN) - .scalarize(0) -diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll -index 5f3544add398..575cd6b874e3 100644 ---- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll -+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll -@@ -2392,8 +2392,8 @@ declare ptr @llvm.stacksave() - declare void @llvm.stackrestore(ptr) - define void @test_stacksaverestore() { - ; CHECK-LABEL: name: test_stacksaverestore -- ; CHECK: [[SAVE:%[0-9]+]]:_(p0) = COPY $sp -- ; CHECK-NEXT: $sp = COPY [[SAVE]](p0) -+ ; CHECK: [[SAVE:%[0-9]+]]:_(p0) = G_STACKSAVE -+ ; CHECK-NEXT: G_STACKRESTORE [[SAVE]] - ; CHECK-NEXT: RET_ReallyLR - %sp = call ptr @llvm.stacksave() - call void @llvm.stackrestore(ptr %sp) -diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir -index b4fe73d29fa6..461161f5b338 100644 ---- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir -+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir -@@ -641,7 +641,15 @@ - # DEBUG-NEXT: G_JUMP_TABLE (opcode {{[0-9]+}}): 1 type index, 0 imm indices - # DEBUG-NEXT: .. the first uncovered type index: 1, OK - # DEBUG-NEXT: .. the first uncovered imm index: 0, OK --# DEBUG-NEXT: G_DYN_STACKALLOC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -+# DEBUG-NEXT: G_DYN_STACKALLOC (opcode [[DYN_STACKALLOC:[0-9]+]]): 2 type indices, 0 imm indices -+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected -+# DEBUG-NEXT: G_STACKSAVE (opcode {{[0-9]+}}): 1 type index, 0 imm indices -+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]] -+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected -+# DEBUG-NEXT: G_STACKRESTORE (opcode {{[0-9]+}}): 1 type index, 0 imm indices -+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]] - # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected - # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected - # DEBUG-NEXT: G_STRICT_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices -diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll -new file mode 100644 -index 000000000000..16bf85af9c17 ---- /dev/null -+++ b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll -@@ -0,0 +1,35 @@ -+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -+; RUN: llc -global-isel=1 -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s -+ -+declare void @use_addr(ptr) -+declare ptr @llvm.stacksave.p0() -+declare void @llvm.stackrestore.p0(ptr) -+ -+define void @test_scoped_alloca(i64 %n) { -+; CHECK-LABEL: test_scoped_alloca: -+; CHECK: // %bb.0: -+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 32 -+; CHECK-NEXT: .cfi_offset w19, -16 -+; CHECK-NEXT: .cfi_offset w30, -24 -+; CHECK-NEXT: .cfi_offset w29, -32 -+; CHECK-NEXT: add x9, x0, #15 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 -+; CHECK-NEXT: mov x19, sp -+; CHECK-NEXT: sub x0, x8, x9 -+; CHECK-NEXT: mov sp, x0 -+; CHECK-NEXT: bl use_addr -+; CHECK-NEXT: mov sp, x19 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -+; CHECK-NEXT: ret -+ %sp = call ptr @llvm.stacksave.p0() -+ %addr = alloca i8, i64 %n -+ call void @use_addr(ptr %addr) -+ call void @llvm.stackrestore.p0(ptr %sp) -+ ret void -+} -diff --git a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll -new file mode 100644 -index 000000000000..e86c04ee22db ---- /dev/null -+++ b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll -@@ -0,0 +1,39 @@ -+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -+; RUN: llc -global-isel=1 -mtriple=x86_64-linux-gnu -o - %s | FileCheck %s -+ -+declare void @use_addr(ptr) -+declare ptr @llvm.stacksave.p0() -+declare void @llvm.stackrestore.p0(ptr) -+ -+define void @test_scoped_alloca(i64 %n) { -+; CHECK-LABEL: test_scoped_alloca: -+; CHECK: # %bb.0: -+; CHECK-NEXT: pushq %rbp -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset %rbp, -16 -+; CHECK-NEXT: movq %rsp, %rbp -+; CHECK-NEXT: .cfi_def_cfa_register %rbp -+; CHECK-NEXT: pushq %rbx -+; CHECK-NEXT: pushq %rax -+; CHECK-NEXT: .cfi_offset %rbx, -24 -+; CHECK-NEXT: movq %rsp, %rbx -+; CHECK-NEXT: movq %rsp, %rax -+; CHECK-NEXT: imulq $1, %rdi, %rcx -+; CHECK-NEXT: addq $15, %rcx -+; CHECK-NEXT: andq $-16, %rcx -+; CHECK-NEXT: subq %rcx, %rax -+; CHECK-NEXT: movq %rax, %rsp -+; CHECK-NEXT: movq %rax, %rdi -+; CHECK-NEXT: callq use_addr -+; CHECK-NEXT: movq %rbx, %rsp -+; CHECK-NEXT: leaq -8(%rbp), %rsp -+; CHECK-NEXT: popq %rbx -+; CHECK-NEXT: popq %rbp -+; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -+; CHECK-NEXT: retq -+ %sp = call ptr @llvm.stacksave.p0() -+ %addr = alloca i8, i64 %n -+ call void @use_addr(ptr %addr) -+ call void @llvm.stackrestore.p0(ptr %sp) -+ ret void -+} --- -2.42.0.windows.2 - diff --git a/0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch b/0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch deleted file mode 100644 index 6fefb9c..0000000 
--- a/0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch +++ /dev/null @@ -1,546 +0,0 @@ -From 8db377e2a22d83637171008b6c8723f1869a2926 Mon Sep 17 00:00:00 2001 -From: rickyleung -Date: Tue, 7 May 2024 21:24:49 +0800 -Subject: [PATCH 3/7] [backport][AArch64] Refactor allocation of locals and - stack realignment - -Reference: https://github.com/wc00862805aj/llvm-project/commit/dedf2c6bb5193652f6ad7d9ff9e676624c2485b7? - -Factor out some stack allocation in a separate function. This patch -splits out the generic portion of a larger refactoring done as a part of -stack clash protection support. - -The patch is almost, but not quite NFC. The only difference should -be that where we have adjacent allocation of stack space -for local SVE objects and non-local SVE objects the order -of `sub sp, ...` and `addvl sp, ...` instructions is reversed, because now -it's done with a single call to `emitFrameOffset` and it happens -add/subtract the fixed part before the scalable part, e.g. - - addvl sp, sp, #-2 - sub sp, sp, llvm#16, lsl llvm#12 - sub sp, sp, llvm#16 - -becomes - - sub sp, sp, llvm#16, lsl llvm#12 - sub sp, sp, llvm#16 - addvl sp, sp, #-2 ---- - .../Target/AArch64/AArch64FrameLowering.cpp | 114 +++++++++--------- - .../lib/Target/AArch64/AArch64FrameLowering.h | 5 + - .../AArch64/framelayout-sve-basepointer.mir | 4 +- - .../framelayout-sve-fixed-width-access.mir | 2 +- - .../framelayout-sve-scavengingslot.mir | 4 +- - llvm/test/CodeGen/AArch64/framelayout-sve.mir | 54 ++++----- - .../AArch64/spill-stack-realignment.mir | 2 +- - llvm/test/CodeGen/AArch64/stack-guard-sve.ll | 4 +- - .../AArch64/sve-calling-convention-mixed.ll | 4 +- - .../CodeGen/AArch64/sve-fixed-length-fp128.ll | 4 +- - 10 files changed, 103 insertions(+), 94 deletions(-) - -diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -index 4d5676f34101..eeb6185fa36d 100644 ---- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -@@ -300,6 +300,7 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF, - static bool produceCompactUnwindFrame(MachineFunction &MF); - static bool needsWinCFI(const MachineFunction &MF); - static StackOffset getSVEStackSize(const MachineFunction &MF); -+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB); - static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF); - - /// Returns true if a homogeneous prolog or epilog code can be emitted -@@ -671,6 +672,44 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores( - emitCalleeSavedRestores(MBB, MBBI, true); - } - -+void AArch64FrameLowering::allocateStackSpace( -+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -+ bool NeedsRealignment, StackOffset AllocSize, bool NeedsWinCFI, -+ bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const { -+ -+ if (!AllocSize) -+ return; -+ -+ DebugLoc DL; -+ MachineFunction &MF = *MBB.getParent(); -+ const AArch64Subtarget &Subtarget = MF.getSubtarget(); -+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); -+ AArch64FunctionInfo &AFI = *MF.getInfo(); -+ const MachineFrameInfo &MFI = MF.getFrameInfo(); -+ -+ Register TargetReg = -+ NeedsRealignment ? 
findScratchNonCalleeSaveRegister(&MBB) : AArch64::SP; -+ // SUB Xd/SP, SP, AllocSize -+ emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, -+ MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, -+ EmitCFI, InitialOffset); -+ -+ if (NeedsRealignment) { -+ const int64_t MaxAlign = MFI.getMaxAlign().value(); -+ const uint64_t AndMask = ~(MaxAlign - 1); -+ // AND SP, Xd, 0b11111...0000 -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) -+ .addReg(TargetReg, RegState::Kill) -+ .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) -+ .setMIFlags(MachineInstr::FrameSetup); -+ AFI.setStackRealigned(true); -+ -+ // No need for SEH instructions here; if we're realigning the stack, -+ // we've set a frame pointer and already finished the SEH prologue. -+ assert(!NeedsWinCFI); -+ } -+} -+ - static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) { - switch (Reg.id()) { - default: -@@ -1769,7 +1808,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, - } - } - -- StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; -+ StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; - MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; - - // Process the SVE callee-saves to determine what space needs to be -@@ -1782,67 +1821,32 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, - ++MBBI; - CalleeSavesEnd = MBBI; - -- AllocateBefore = StackOffset::getScalable(CalleeSavedSize); -- AllocateAfter = SVEStackSize - AllocateBefore; -+ SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize); -+ SVELocalsSize = SVEStackSize - SVECalleeSavesSize; - } - - // Allocate space for the callee saves (if any). -- emitFrameOffset( -- MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, -- MachineInstr::FrameSetup, false, false, nullptr, -- EmitAsyncCFI && !HasFP && AllocateBefore, -- StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); -+ StackOffset CFAOffset = -+ StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); -+ allocateStackSpace(MBB, CalleeSavesBegin, false, SVECalleeSavesSize, false, -+ nullptr, EmitAsyncCFI && !HasFP, CFAOffset); -+ CFAOffset += SVECalleeSavesSize; - - if (EmitAsyncCFI) - emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); - -- // Finally allocate remaining SVE stack space. -- emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, -- -AllocateAfter, TII, MachineInstr::FrameSetup, false, false, -- nullptr, EmitAsyncCFI && !HasFP && AllocateAfter, -- AllocateBefore + StackOffset::getFixed( -- (int64_t)MFI.getStackSize() - NumBytes)); -- -- // Allocate space for the rest of the frame. -- if (NumBytes) { -- unsigned scratchSPReg = AArch64::SP; -- -- if (NeedsRealignment) { -- scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); -- assert(scratchSPReg != AArch64::NoRegister); -- } -- -- // If we're a leaf function, try using the red zone. -- if (!canUseRedZone(MF)) { -- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have -- // the correct value here, as NumBytes also includes padding bytes, -- // which shouldn't be counted here. 
-- emitFrameOffset( -- MBB, MBBI, DL, scratchSPReg, AArch64::SP, -- StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, -- false, NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, -- SVEStackSize + -- StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); -- } -- if (NeedsRealignment) { -- assert(MFI.getMaxAlign() > Align(1)); -- assert(scratchSPReg != AArch64::SP); -- -- // SUB X9, SP, NumBytes -- // -- X9 is temporary register, so shouldn't contain any live data here, -- // -- free to use. This is already produced by emitFrameOffset above. -- // AND SP, X9, 0b11111...0000 -- uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1); -- -- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) -- .addReg(scratchSPReg, RegState::Kill) -- .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)); -- AFI->setStackRealigned(true); -- -- // No need for SEH instructions here; if we're realigning the stack, -- // we've set a frame pointer and already finished the SEH prologue. -- assert(!NeedsWinCFI); -- } -+ // Allocate space for the rest of the frame including SVE locals. Align the -+ // stack as necessary. -+ assert(!(canUseRedZone(MF) && NeedsRealignment) && -+ "Cannot use redzone with stack realignment"); -+ if (!canUseRedZone(MF)) { -+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have -+ // the correct value here, as NumBytes also includes padding bytes, -+ // which shouldn't be counted here. -+ allocateStackSpace(MBB, CalleeSavesEnd, NeedsRealignment, -+ SVELocalsSize + StackOffset::getFixed(NumBytes), -+ NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, -+ CFAOffset); - } - - // If we need a base pointer, set it up here. It's whatever the value of the -diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h -index 147b5c181be5..f3313f3b53ff 100644 ---- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h -+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h -@@ -150,6 +150,11 @@ private: - MachineBasicBlock::iterator MBBI) const; - void emitCalleeSavedSVERestores(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; -+ void allocateStackSpace(MachineBasicBlock &MBB, -+ MachineBasicBlock::iterator MBBI, -+ bool NeedsRealignment, StackOffset AllocSize, -+ bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, -+ StackOffset InitialOffset) const; - - /// Emit target zero call-used regs. 
- void emitZeroCallUsedRegs(BitVector RegsToZero, -diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir -index 623c0f240be4..265c474fbc5d 100644 ---- a/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir -+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir -@@ -4,8 +4,8 @@ - name: hasBasepointer - # CHECK-LABEL: name: hasBasepointer - # CHECK: bb.0: --# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 --# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: $x19 = ADDXri $sp, 0, 0 - # CHECK: STRXui $x0, $x19, 0 - tracksRegLiveness: true -diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir -index e367a380f8ba..35fd7ca77d5c 100644 ---- a/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir -+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir -@@ -7,9 +7,9 @@ - ; CHECK: // %bb.0: // %entry - ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill - ; CHECK-NEXT: mov x29, sp -+ ; CHECK-NEXT: sub sp, sp, #2064 - ; CHECK-NEXT: addvl sp, sp, #-32 - ; CHECK-NEXT: addvl sp, sp, #-28 -- ; CHECK-NEXT: sub sp, sp, #2064 - ; CHECK-NEXT: ldr x8, [sp, #2048] - ; CHECK-NEXT: addvl sp, sp, #31 - ; CHECK-NEXT: addvl sp, sp, #29 -diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir -index d54f67634d02..680f9c335c25 100644 ---- a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir -+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir -@@ -4,9 +4,9 @@ - name: LateScavengingSlot - # CHECK-LABEL: name: LateScavengingSlot - # CHECK: bb.0: --# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 --# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 8, 12 -+# CHECK: $sp = frame-setup SUBXri $sp, 8, 12 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK: STRXui killed $[[SCRATCH:x[0-9]+]], $sp, 0 - # CHECK-NEXT: $[[SCRATCH]] = ADDVL_XXI $fp, -1 - # CHECK-NEXT: STRXui $x0, killed $[[SCRATCH]], 0 -diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir -index 7c87587c6dc4..8b657c95bfc7 100644 ---- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir -+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir -@@ -60,10 +60,10 @@ - # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 --# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 --# CHECK-NEXT: CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 - - # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 - # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32 -@@ -77,7 +77,7 @@ - # ASM-LABEL: test_allocate_sve: - # ASM: 
.cfi_def_cfa_offset 16 - # ASM-NEXT: .cfi_offset w29, -16 --# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -+# ASM: .cfi_def_cfa_offset 32 - # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG - # ASM: .cfi_def_cfa wsp, 32 - # ASM: .cfi_def_cfa_offset 16 -@@ -87,7 +87,7 @@ - # - # UNWINDINFO: DW_CFA_def_cfa_offset: +16 - # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 --# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -+# UNWINDINFO: DW_CFA_def_cfa_offset: +32 - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 - # UNWINDINFO: DW_CFA_def_cfa_offset: +16 -@@ -125,9 +125,9 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w20, -8 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w21, -16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -32 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 --# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 48 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # - # CHECK-NEXT: $x20 = IMPLICIT_DEF -@@ -149,7 +149,7 @@ body: | - # ASM: .cfi_offset w20, -8 - # ASM-NEXT: .cfi_offset w21, -16 - # ASM-NEXT: .cfi_offset w29, -32 --# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG -+# ASM: .cfi_def_cfa_offset 48 - # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG - # - # ASM: .cfi_def_cfa wsp, 48 -@@ -164,7 +164,7 @@ body: | - # UNWINDINFO: DW_CFA_offset: reg20 -8 - # UNWINDINFO-NEXT: DW_CFA_offset: reg21 -16 - # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 --# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -+# UNWINDINFO: DW_CFA_def_cfa_offset: +48 - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # - # UNWINDINFO: DW_CFA_def_cfa: reg31 +48 -@@ -205,9 +205,9 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 - # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 --# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] -+# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2 -+# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] - # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 - # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 - # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 -@@ -267,9 +267,9 @@ body: | - # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 - # CHECK-NEXT: frame-setup 
CFI_INSTRUCTION offset $w29, -16 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 --# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 - - # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 16 -@@ -292,7 +292,7 @@ body: | - # ASM-LABEL: test_address_sve: - # ASM: .cfi_def_cfa_offset 16 - # ASM-NEXT: .cfi_offset w29, -16 --# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG -+# ASM: .cfi_def_cfa_offset 32 - # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 24 * VG - # - # ASM: .cfi_def_cfa wsp, 32 -@@ -302,7 +302,7 @@ body: | - # - # UNWINDINFO: DW_CFA_def_cfa_offset: +16 - # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 --# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -+# UNWINDINFO: DW_CFA_def_cfa_offset: +32 - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # - # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 -@@ -353,8 +353,8 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 - - # CHECK-NEXT: STR_ZXI $z0, $fp, -1 - # CHECK-NEXT: STR_ZXI $z1, $fp, -2 -@@ -429,9 +429,9 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 - --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 --# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK: $[[TMP:x[0-9]+]] = ADDVL_XXI $sp, 1 - # CHECK-NEXT: $x0 = LDRXui killed $[[TMP]], 4 -@@ -448,7 +448,7 @@ body: | - # ASM-LABEL: test_stack_arg_sve: - # ASM: .cfi_def_cfa_offset 16 - # ASM-NEXT: .cfi_offset w29, -16 --# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -+# ASM: .cfi_def_cfa_offset 32 - # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG - # - # ASM: .cfi_def_cfa wsp, 32 -@@ -458,7 +458,7 @@ body: | - - # UNWINDINFO: DW_CFA_def_cfa_offset: +16 - # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 --# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -+# UNWINDINFO: DW_CFA_def_cfa_offset: +32 - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, 
DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # - # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 -@@ -640,8 +640,8 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w19, -16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -24 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -32 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: $x19 = ADDXri $sp, 0, 0 - # CHECK-NEXT: STRXui $xzr, $x19, 0 - # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 -@@ -863,9 +863,9 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 --# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 --# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 -+# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 - - # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 -@@ -916,7 +916,7 @@ body: | - # ASM-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 32 - 48 * VG - # ASM-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 32 - 56 * VG - # ASM-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG --# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG -+# ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 144 * VG - # ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 152 * VG - # - # ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG -@@ -950,7 +950,7 @@ body: | - # UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -48, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -56, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -64, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus --# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +144, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # - # UNWINDINFO: DW_CFA_def_cfa_expression: 
DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -@@ -1031,9 +1031,9 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 --# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] -+# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -1 -+# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] - - # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 - # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 -diff --git a/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir b/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir -index 1b9411d07f43..f6fc627ac2d3 100644 ---- a/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir -+++ b/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir -@@ -21,7 +21,7 @@ stack: - - { id: 1, size: 4, alignment: 4, local-offset: -68 } - - # CHECK: body: --# CHECK: $sp = ANDXri killed ${{x[0-9]+}}, 7865 -+# CHECK: $sp = frame-setup ANDXri killed ${{x[0-9]+}}, 7865 - # CHECK: STRSui $s0, $sp, 0 - # CHECK: STRSui $s0, $fp, 7 - body: | -diff --git a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll -index 1672a7eb8739..5acbb22bf1ab 100644 ---- a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll -+++ b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll -@@ -148,9 +148,9 @@ entry: - - ; CHECK-LABEL: local_stack_alloc: - ; CHECK: mov x29, sp --; CHECK: addvl sp, sp, #-2 - ; CHECK: sub sp, sp, #16, lsl #12 - ; CHECK: sub sp, sp, #16 -+; CHECK: addvl sp, sp, #-2 - - ; Stack guard is placed below the SVE stack area (and above all fixed-width objects) - ; CHECK-DAG: add [[STACK_GUARD_SPILL_PART_LOC:x[0-9]+]], sp, #8, lsl #12 -@@ -198,9 +198,9 @@ entry: - - ; CHECK-LABEL: local_stack_alloc_strong: - ; CHECK: mov x29, sp --; CHECK: addvl sp, sp, #-3 - ; CHECK: sub sp, sp, #16, lsl #12 - ; CHECK: sub sp, sp, #16 -+; CHECK: addvl sp, sp, #-3 - - ; Stack guard is placed at the top of the SVE stack area - ; CHECK-DAG: ldr [[STACK_GUARD:x[0-9]+]], [{{x[0-9]+}}, :lo12:__stack_chk_guard] -diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll -index a97649523565..235364ac2321 100644 ---- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll -+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll -@@ -56,8 +56,8 @@ define float @foo2(ptr %x0, ptr %x1) nounwind { - ; CHECK-LABEL: foo2: - ; CHECK: // %bb.0: // %entry - ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill --; CHECK-NEXT: addvl sp, sp, #-4 - ; CHECK-NEXT: sub sp, sp, #16 -+; CHECK-NEXT: addvl sp, sp, #-4 - ; CHECK-NEXT: ptrue p0.b - ; CHECK-NEXT: add x8, sp, #16 - ; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] -@@ -699,8 +699,8 @@ define void @verify_all_operands_are_initialised() { - ; CHECK-LABEL: verify_all_operands_are_initialised: - ; CHECK: // %bb.0: - ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill --; CHECK-NEXT: addvl sp, sp, #-1 - ; CHECK-NEXT: sub sp, sp, #16 -+; CHECK-NEXT: addvl sp, sp, #-1 - ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG - ; CHECK-NEXT: .cfi_offset w30, -8 - ; CHECK-NEXT: .cfi_offset w29, -16 -diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll -index 31ff9287046c..b3529549c22b 100644 ---- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll -+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll -@@ -9,8 +9,8 @@ define void @fcvt_v4f64_v4f128(ptr %a, ptr %b) vscale_range(2,0) #0 { - ; CHECK: // %bb.0: - ; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill - ; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill --; CHECK-NEXT: addvl sp, sp, #-2 - ; CHECK-NEXT: sub sp, sp, #48 -+; CHECK-NEXT: addvl sp, sp, #-2 - ; CHECK-NEXT: ptrue p0.d, vl4 - ; CHECK-NEXT: add x8, sp, #48 - ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -@@ -59,8 +59,8 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { - ; CHECK: // %bb.0: - ; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill - ; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill --; CHECK-NEXT: addvl sp, sp, #-2 - ; CHECK-NEXT: sub sp, sp, #128 -+; CHECK-NEXT: addvl sp, sp, #-2 - ; CHECK-NEXT: ldr q1, [x0, #64] - ; CHECK-NEXT: mov x19, x1 - ; CHECK-NEXT: ldr q0, [x0, #80] --- -2.42.0.windows.2 - diff --git a/0023-Backport-AArch64-Stack-probing-for-function-prologues.patch b/0023-Backport-AArch64-Stack-probing-for-function-prologues.patch deleted file mode 100644 index ff40b08..0000000 --- a/0023-Backport-AArch64-Stack-probing-for-function-prologues.patch +++ /dev/null @@ -1,2652 +0,0 @@ -From 3a9ddc2f95926a75a9b436ad4dfd4070f535a113 Mon Sep 17 00:00:00 2001 -From: rickyleung -Date: Tue, 7 May 2024 21:25:52 +0800 -Subject: [PATCH 4/7] [backport][AArch64] Stack probing for function prologues - -Reference: https://github.com/llvm/llvm-project/commit/cc944f502f1ee20d73ff88c2c86cc909f12caadb - -This adds code to AArch64 function prologues to protect against stack -clash attacks by probing (writing to) the stack at regular enough -intervals to ensure that the guard page cannot be skipped over. - -The patch depends on and maintains the following invariants: - -Upon function entry the caller guarantees that it has probed the stack -(e.g. performed a store) at some address [sp, #N], where`0 <= N <= -1024`. This invariant comes from a requirement for compatibility with -GCC. Any address range in the allocated stack, no smaller than -stack-probe-size bytes contains at least one probe At any time the stack -pointer is above or in the guard page Probes are performed in -descreasing address order -The stack-probe-size is a function attribute that can be set by a -platform to correspond to the guard page size. - -By default, the stack probe size is 4KiB, which is a safe default as -this is the smallest possible page size for AArch64. Linux uses a 64KiB -guard for AArch64, so this can be overridden by the stack-probe-size -function attribute. - -For small frames without a frame pointer (<= 240 bytes), no probes are -needed. - -For larger frame sizes, LLVM always stores x29 to the stack. This serves -as an implicit stack probe. Thus, while allocating stack objects the -compiler assumes that the stack has been probed at [sp]. 
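-
-As a concrete sketch (assuming Clang with -fstack-clash-protection and
-the default 4KiB probe size; the names below are arbitrary), a frame
-such as
-
-  void consume(char *buf);
-
-  void g(void) {
-    char buf[32768]; /* spans eight 4KiB probe intervals */
-    consume(buf);
-  }
-
-must be allocated with a store at least every 4KiB so that the guard
-page cannot be skipped over.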
- -There are multiple probing sequences that can be emitted, depending on -the size of the stack allocation: - -A straight-line sequence of subtracts and stores, used when the -allocation size is smaller than 5 guard pages. A loop allocating and -probing one page size per iteration, plus at most a single probe to deal -with the remainder, used when the allocation size is larger but still -known at compile time. A loop which moves the SP down to the target -value held in a register (or a loop, moving a scratch register to the -target value help in SP), used when the allocation size is not known at -compile-time, such as when allocating space for SVE values, or when -over-aligning the stack. This is emitted in AArch64InstrInfo because it -will also be used for dynamic allocas in a future patch. A single probe -where the amount of stack adjustment is unknown, but is known to be less -than or equal to a page size. - ---------- - -Co-authored-by: Oliver Stannard ---- - .../Target/AArch64/AArch64FrameLowering.cpp | 335 +++++++- - .../lib/Target/AArch64/AArch64FrameLowering.h | 17 +- - .../Target/AArch64/AArch64ISelLowering.cpp | 6 + - llvm/lib/Target/AArch64/AArch64ISelLowering.h | 10 + - llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 90 +++ - llvm/lib/Target/AArch64/AArch64InstrInfo.h | 6 + - llvm/lib/Target/AArch64/AArch64InstrInfo.td | 24 +- - .../AArch64/AArch64MachineFunctionInfo.cpp | 43 +- - .../AArch64/AArch64MachineFunctionInfo.h | 6 + - .../test/CodeGen/AArch64/stack-probing-64k.ll | 392 ++++++++++ - .../AArch64/stack-probing-last-in-block.mir | 146 ++++ - .../test/CodeGen/AArch64/stack-probing-sve.ll | 724 ++++++++++++++++++ - llvm/test/CodeGen/AArch64/stack-probing.ll | 539 +++++++++++++ - 13 files changed, 2300 insertions(+), 38 deletions(-) - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-64k.ll - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-sve.ll - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing.ll - -diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -index eeb6185fa36d..af019ab23770 100644 ---- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -@@ -672,10 +672,18 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores( - emitCalleeSavedRestores(MBB, MBBI, true); - } - -+// Return the maximum possible number of bytes for `Size` due to the -+// architectural limit on the size of a SVE register. -+static int64_t upperBound(StackOffset Size) { -+ static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16; -+ return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed(); -+} -+ - void AArch64FrameLowering::allocateStackSpace( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -- bool NeedsRealignment, StackOffset AllocSize, bool NeedsWinCFI, -- bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const { -+ int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI, -+ bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset, -+ bool FollowupAllocs) const { - - if (!AllocSize) - return; -@@ -687,27 +695,129 @@ void AArch64FrameLowering::allocateStackSpace( - AArch64FunctionInfo &AFI = *MF.getInfo(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - -- Register TargetReg = -- NeedsRealignment ? 
findScratchNonCalleeSaveRegister(&MBB) : AArch64::SP; -- // SUB Xd/SP, SP, AllocSize -+ const int64_t MaxAlign = MFI.getMaxAlign().value(); -+ const uint64_t AndMask = ~(MaxAlign - 1); -+ -+ if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) { -+ Register TargetReg = RealignmentPadding -+ ? findScratchNonCalleeSaveRegister(&MBB) -+ : AArch64::SP; -+ // SUB Xd/SP, SP, AllocSize -+ emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, -+ MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, -+ EmitCFI, InitialOffset); -+ -+ if (RealignmentPadding) { -+ // AND SP, X9, 0b11111...0000 -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) -+ .addReg(TargetReg, RegState::Kill) -+ .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) -+ .setMIFlags(MachineInstr::FrameSetup); -+ AFI.setStackRealigned(true); -+ -+ // No need for SEH instructions here; if we're realigning the stack, -+ // we've set a frame pointer and already finished the SEH prologue. -+ assert(!NeedsWinCFI); -+ } -+ return; -+ } -+ -+ // -+ // Stack probing allocation. -+ // -+ -+ // Fixed length allocation. If we don't need to re-align the stack and don't -+ // have SVE objects, we can use a more efficient sequence for stack probing. -+ if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) { -+ Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); -+ assert(ScratchReg != AArch64::NoRegister); -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC)) -+ .addDef(ScratchReg) -+ .addImm(AllocSize.getFixed()) -+ .addImm(InitialOffset.getFixed()) -+ .addImm(InitialOffset.getScalable()); -+ // The fixed allocation may leave unprobed bytes at the top of the -+ // stack. If we have subsequent alocation (e.g. if we have variable-sized -+ // objects), we need to issue an extra probe, so these allocations start in -+ // a known state. -+ if (FollowupAllocs) { -+ // STR XZR, [SP] -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) -+ .addReg(AArch64::XZR) -+ .addReg(AArch64::SP) -+ .addImm(0) -+ .setMIFlags(MachineInstr::FrameSetup); -+ } -+ -+ return; -+ } -+ -+ // Variable length allocation. -+ -+ // If the (unknown) allocation size cannot exceed the probe size, decrement -+ // the stack pointer right away. -+ int64_t ProbeSize = AFI.getStackProbeSize(); -+ if (upperBound(AllocSize) + RealignmentPadding <= ProbeSize) { -+ Register ScratchReg = RealignmentPadding -+ ? findScratchNonCalleeSaveRegister(&MBB) -+ : AArch64::SP; -+ assert(ScratchReg != AArch64::NoRegister); -+ // SUB Xd, SP, AllocSize -+ emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII, -+ MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, -+ EmitCFI, InitialOffset); -+ if (RealignmentPadding) { -+ // AND SP, Xn, 0b11111...0000 -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) -+ .addReg(ScratchReg, RegState::Kill) -+ .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) -+ .setMIFlags(MachineInstr::FrameSetup); -+ AFI.setStackRealigned(true); -+ } -+ if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding > -+ AArch64::StackProbeMaxUnprobedStack) { -+ // STR XZR, [SP] -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) -+ .addReg(AArch64::XZR) -+ .addReg(AArch64::SP) -+ .addImm(0) -+ .setMIFlags(MachineInstr::FrameSetup); -+ } -+ return; -+ } -+ -+ // Emit a variable-length allocation probing loop. -+ // TODO: As an optimisation, the loop can be "unrolled" into a few parts, -+ // each of them guaranteed to adjust the stack by less than the probe size. 
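-+  // The expansion (see probedStackAlloc) moves SP down towards the value
-+  // in TargetReg at most ProbeSize bytes at a time, storing XZR after
-+  // each step, so no span larger than the probe size is left unprobed.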
-+ Register TargetReg = findScratchNonCalleeSaveRegister(&MBB); -+ assert(TargetReg != AArch64::NoRegister); -+ // SUB Xd, SP, AllocSize - emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, - EmitCFI, InitialOffset); - -- if (NeedsRealignment) { -- const int64_t MaxAlign = MFI.getMaxAlign().value(); -- const uint64_t AndMask = ~(MaxAlign - 1); -- // AND SP, Xd, 0b11111...0000 -- BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) -+ if (RealignmentPadding) { -+ // AND Xn, Xn, 0b11111...0000 -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), TargetReg) - .addReg(TargetReg, RegState::Kill) - .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) - .setMIFlags(MachineInstr::FrameSetup); -- AFI.setStackRealigned(true); -+ } - -- // No need for SEH instructions here; if we're realigning the stack, -- // we've set a frame pointer and already finished the SEH prologue. -- assert(!NeedsWinCFI); -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR)) -+ .addReg(TargetReg); -+ if (EmitCFI) { -+ // Set the CFA register back to SP. -+ unsigned Reg = -+ Subtarget.getRegisterInfo()->getDwarfRegNum(AArch64::SP, true); -+ unsigned CFIIndex = -+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); -+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) -+ .addCFIIndex(CFIIndex) -+ .setMIFlags(MachineInstr::FrameSetup); - } -+ if (RealignmentPadding) -+ AFI.setStackRealigned(true); - } - - static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) { -@@ -893,9 +1003,11 @@ bool AArch64FrameLowering::canUseAsPrologue( - MachineBasicBlock *TmpMBB = const_cast(&MBB); - const AArch64Subtarget &Subtarget = MF->getSubtarget(); - const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); -+ const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); - -- // Don't need a scratch register if we're not going to re-align the stack. -- if (!RegInfo->hasStackRealignment(*MF)) -+ // Don't need a scratch register if we're not going to re-align the stack or -+ // emit stack probes. -+ if (!RegInfo->hasStackRealignment(*MF) && TLI->hasInlineStackProbe(*MF)) - return true; - // Otherwise, we can use any block as long as it has a scratch register - // available. -@@ -905,15 +1017,11 @@ bool AArch64FrameLowering::canUseAsPrologue( - static bool windowsRequiresStackProbe(MachineFunction &MF, - uint64_t StackSizeInBytes) { - const AArch64Subtarget &Subtarget = MF.getSubtarget(); -- if (!Subtarget.isTargetWindows()) -- return false; -- const Function &F = MF.getFunction(); -+ const AArch64FunctionInfo &MFI = *MF.getInfo(); - // TODO: When implementing stack protectors, take that into account - // for the probe threshold. -- unsigned StackProbeSize = -- F.getFnAttributeAsParsedInteger("stack-probe-size", 4096); -- return (StackSizeInBytes >= StackProbeSize) && -- !F.hasFnAttribute("no-stack-arg-probe"); -+ return Subtarget.isTargetWindows() && MFI.hasStackProbing() && -+ StackSizeInBytes >= uint64_t(MFI.getStackProbeSize()); - } - - static bool needsWinCFI(const MachineFunction &MF) { -@@ -1678,7 +1786,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, - // Alignment is required for the parent frame, not the funclet - const bool NeedsRealignment = - NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF); -- int64_t RealignmentPadding = -+ const int64_t RealignmentPadding = - (NeedsRealignment && MFI.getMaxAlign() > Align(16)) - ? 
MFI.getMaxAlign().value() - 16 - : 0; -@@ -1814,6 +1922,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, - // Process the SVE callee-saves to determine what space needs to be - // allocated. - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { -+ LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize -+ << "\n"); - // Find callee save instructions in frame. - CalleeSavesBegin = MBBI; - assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); -@@ -1828,8 +1938,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, - // Allocate space for the callee saves (if any). - StackOffset CFAOffset = - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); -- allocateStackSpace(MBB, CalleeSavesBegin, false, SVECalleeSavesSize, false, -- nullptr, EmitAsyncCFI && !HasFP, CFAOffset); -+ StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes); -+ allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false, -+ nullptr, EmitAsyncCFI && !HasFP, CFAOffset, -+ MFI.hasVarSizedObjects() || LocalsSize); - CFAOffset += SVECalleeSavesSize; - - if (EmitAsyncCFI) -@@ -1843,10 +1955,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. -- allocateStackSpace(MBB, CalleeSavesEnd, NeedsRealignment, -+ allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding, - SVELocalsSize + StackOffset::getFixed(NumBytes), - NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, -- CFAOffset); -+ CFAOffset, MFI.hasVarSizedObjects()); - } - - // If we need a base pointer, set it up here. It's whatever the value of the -@@ -4028,3 +4140,170 @@ void AArch64FrameLowering::orderFrameObjects( - dbgs() << "\n"; - }); - } -+ -+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at -+/// least every ProbeSize bytes. Returns an iterator of the first instruction -+/// after the loop. The difference between SP and TargetReg must be an exact -+/// multiple of ProbeSize. -+MachineBasicBlock::iterator -+AArch64FrameLowering::inlineStackProbeLoopExactMultiple( -+ MachineBasicBlock::iterator MBBI, int64_t ProbeSize, -+ Register TargetReg) const { -+ MachineBasicBlock &MBB = *MBBI->getParent(); -+ MachineFunction &MF = *MBB.getParent(); -+ const AArch64InstrInfo *TII = -+ MF.getSubtarget().getInstrInfo(); -+ DebugLoc DL = MBB.findDebugLoc(MBBI); -+ -+ MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); -+ MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); -+ MF.insert(MBBInsertPoint, LoopMBB); -+ MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); -+ MF.insert(MBBInsertPoint, ExitMBB); -+ -+ // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable -+ // in SUB). 
-+  emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP,
-+                  StackOffset::getFixed(-ProbeSize), TII,
-+                  MachineInstr::FrameSetup);
-+  // STR XZR, [SP]
-+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui))
-+      .addReg(AArch64::XZR)
-+      .addReg(AArch64::SP)
-+      .addImm(0)
-+      .setMIFlags(MachineInstr::FrameSetup);
-+  // CMP SP, TargetReg
-+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
-+          AArch64::XZR)
-+      .addReg(AArch64::SP)
-+      .addReg(TargetReg)
-+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
-+      .setMIFlags(MachineInstr::FrameSetup);
-+  // B.CC Loop
-+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc))
-+      .addImm(AArch64CC::NE)
-+      .addMBB(LoopMBB)
-+      .setMIFlags(MachineInstr::FrameSetup);
-+
-+  LoopMBB->addSuccessor(ExitMBB);
-+  LoopMBB->addSuccessor(LoopMBB);
-+  // Synthesize the exit MBB.
-+  ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end());
-+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
-+  MBB.addSuccessor(LoopMBB);
-+  // Update liveins.
-+  recomputeLiveIns(*LoopMBB);
-+  recomputeLiveIns(*ExitMBB);
-+
-+  return ExitMBB->begin();
-+}
-+
-+void AArch64FrameLowering::inlineStackProbeFixed(
-+    MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
-+    StackOffset CFAOffset) const {
-+  MachineBasicBlock *MBB = MBBI->getParent();
-+  MachineFunction &MF = *MBB->getParent();
-+  const AArch64InstrInfo *TII =
-+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
-+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-+  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
-+  bool HasFP = hasFP(MF);
-+
-+  DebugLoc DL;
-+  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
-+  int64_t NumBlocks = FrameSize / ProbeSize;
-+  int64_t ResidualSize = FrameSize % ProbeSize;
-+
-+  LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
-+                    << NumBlocks << " blocks of " << ProbeSize
-+                    << " bytes, plus " << ResidualSize << " bytes\n");
-+
-+  // Decrement SP by NumBlocks * ProbeSize bytes, with either an unrolled or
-+  // ordinary loop.
-+  if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
-+    for (int i = 0; i < NumBlocks; ++i) {
-+      // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not
-+      // encodable in a SUB).
-+      emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
-+                      StackOffset::getFixed(-ProbeSize), TII,
-+                      MachineInstr::FrameSetup, false, false, nullptr,
-+                      EmitAsyncCFI && !HasFP, CFAOffset);
-+      CFAOffset += StackOffset::getFixed(ProbeSize);
-+      // STR XZR, [SP]
-+      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
-+          .addReg(AArch64::XZR)
-+          .addReg(AArch64::SP)
-+          .addImm(0)
-+          .setMIFlags(MachineInstr::FrameSetup);
-+    }
-+  } else if (NumBlocks != 0) {
-+    // SUB ScratchReg, SP, #FrameSize (or equivalent if FrameSize is not
-+    // encodable in ADD). ScratchReg may temporarily become the CFA register.
-+    emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP,
-+                    StackOffset::getFixed(-ProbeSize * NumBlocks), TII,
-+                    MachineInstr::FrameSetup, false, false, nullptr,
-+                    EmitAsyncCFI && !HasFP, CFAOffset);
-+    CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks);
-+    MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg);
-+    MBB = MBBI->getParent();
-+    if (EmitAsyncCFI && !HasFP) {
-+      // Set the CFA register back to SP.
-+      const AArch64RegisterInfo &RegInfo =
-+          *MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
-+      unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
-+      unsigned CFIIndex =
-+          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
-+      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-+          .addCFIIndex(CFIIndex)
-+          .setMIFlags(MachineInstr::FrameSetup);
-+    }
-+  }
-+
-+  if (ResidualSize != 0) {
-+    // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not encodable
-+    // in SUB).
-+    emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
-+                    StackOffset::getFixed(-ResidualSize), TII,
-+                    MachineInstr::FrameSetup, false, false, nullptr,
-+                    EmitAsyncCFI && !HasFP, CFAOffset);
-+    if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) {
-+      // STR XZR, [SP]
-+      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
-+          .addReg(AArch64::XZR)
-+          .addReg(AArch64::SP)
-+          .addImm(0)
-+          .setMIFlags(MachineInstr::FrameSetup);
-+    }
-+  }
-+}
-+
-+void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
-+                                            MachineBasicBlock &MBB) const {
-+  // Get the instructions that need to be replaced. We emit at most two of
-+  // these. Remember them in order to avoid complications coming from the need
-+  // to traverse the block while potentially creating more blocks.
-+  SmallVector<MachineInstr *, 4> ToReplace;
-+  for (MachineInstr &MI : MBB)
-+    if (MI.getOpcode() == AArch64::PROBED_STACKALLOC ||
-+        MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR)
-+      ToReplace.push_back(&MI);
-+
-+  for (MachineInstr *MI : ToReplace) {
-+    if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) {
-+      Register ScratchReg = MI->getOperand(0).getReg();
-+      int64_t FrameSize = MI->getOperand(1).getImm();
-+      StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(),
-+                                               MI->getOperand(3).getImm());
-+      inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize,
-+                            CFAOffset);
-+    } else {
-+      assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR &&
-+             "Stack probe pseudo-instruction expected");
-+      const AArch64InstrInfo *TII =
-+          MI->getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
-+      Register TargetReg = MI->getOperand(0).getReg();
-+      (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true);
-+    }
-+    MI->eraseFromParent();
-+  }
-+}
-\ No newline at end of file
-diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
-index f3313f3b53ff..941af03a78b7 100644
---- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
-+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
-@@ -152,13 +152,26 @@ private:
-                                       MachineBasicBlock::iterator MBBI) const;
-   void allocateStackSpace(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MBBI,
--                          bool NeedsRealignment, StackOffset AllocSize,
-+                          int64_t RealignmentPadding, StackOffset AllocSize,
-                           bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
--                          StackOffset InitialOffset) const;
-+                          StackOffset InitialOffset, bool FollowupAllocs) const;
-
-   /// Emit target zero call-used regs.
-   void emitZeroCallUsedRegs(BitVector RegsToZero,
-                             MachineBasicBlock &MBB) const override;
-+
-+  /// Replace a StackProbe stub (if any) with the actual probe code inline
-+  void inlineStackProbe(MachineFunction &MF,
-+                        MachineBasicBlock &PrologueMBB) const override;
-+
-+  void inlineStackProbeFixed(MachineBasicBlock::iterator MBBI,
-+                             Register ScratchReg, int64_t FrameSize,
-+                             StackOffset CFAOffset) const;
-+
-+  MachineBasicBlock::iterator
-+  inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI,
-+                                    int64_t NegProbeSize,
-+                                    Register TargetReg) const;
- };
-
- } // End llvm namespace
-diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
-index 6e721b937846..082043420fb9 100644
---- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
-+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
-@@ -26051,3 +26051,9 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
-   }
-   return true;
- }
-+
-+bool AArch64TargetLowering::hasInlineStackProbe(
-+    const MachineFunction &MF) const {
-+  return !Subtarget->isTargetWindows() &&
-+         MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
-+}
-\ No newline at end of file
-diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
-index aca45f113e73..643d363e234a 100644
---- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
-+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
-@@ -508,6 +508,13 @@ const unsigned RoundingBitsPos = 22;
- const ArrayRef<MCPhysReg> getGPRArgRegs();
- const ArrayRef<MCPhysReg> getFPRArgRegs();
-
-+/// Maximum allowed number of unprobed bytes above SP at an ABI
-+/// boundary.
-+const unsigned StackProbeMaxUnprobedStack = 1024;
-+
-+/// Maximum number of iterations to unroll for a constant size probing loop.
-+const unsigned StackProbeMaxLoopUnroll = 4;
-+
- } // namespace AArch64
-
- class AArch64Subtarget;
-@@ -942,6 +949,9 @@ public:
-   // used for 64bit and 128bit vectors as well.
-   bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;
-
-+  /// True if stack clash protection is enabled for this function.
-+  bool hasInlineStackProbe(const MachineFunction &MF) const override;
-+
- private:
-   /// Keep a pointer to the AArch64Subtarget around so that we can
-   /// make the right decision when generating code for different targets.
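For reference, the two constants introduced above drive the shape of the code emitted by inlineStackProbeFixed in AArch64FrameLowering.cpp. The following standalone sketch is not part of the patch; the function name and the plain-integer interface are illustrative only, and it merely mirrors the NumBlocks/ResidualSize arithmetic of that routine for a constant-sized frame:

#include <cstdint>
#include <cstdio>

// Illustrative copies of AArch64::StackProbeMaxLoopUnroll and
// AArch64::StackProbeMaxUnprobedStack from the hunk above.
constexpr int64_t StackProbeMaxLoopUnroll = 4;
constexpr int64_t StackProbeMaxUnprobedStack = 1024;

// Hypothetical helper: print how a fixed-size stack allocation is probed.
void describeProbePlan(int64_t FrameSize, int64_t ProbeSize) {
  const int64_t NumBlocks = FrameSize / ProbeSize;
  const int64_t ResidualSize = FrameSize % ProbeSize;
  if (NumBlocks <= StackProbeMaxLoopUnroll)
    // Few blocks: unroll into NumBlocks copies of "SUB sp; STR xzr, [sp]".
    std::printf("unrolled: %lld x (SUB sp, sp, #%lld; STR xzr, [sp])\n",
                (long long)NumBlocks, (long long)ProbeSize);
  else
    // Many blocks: compute the loop bound into a scratch register and emit
    // the probing loop (inlineStackProbeLoopExactMultiple).
    std::printf("loop: probe %lld blocks of %lld bytes\n",
                (long long)NumBlocks, (long long)ProbeSize);
  if (ResidualSize != 0)
    // The residual allocation only needs a probe of its own when more than
    // StackProbeMaxUnprobedStack bytes could remain unprobed above SP.
    std::printf("residual: SUB sp, sp, #%lld%s\n", (long long)ResidualSize,
                ResidualSize > StackProbeMaxUnprobedStack ? "; STR xzr, [sp]"
                                                          : "");
}

For instance, describeProbePlan(66576, 65536) prints one unrolled probe plus a probed 1040-byte residual, which matches the sequence checked by static_66576 in the stack-probing-64k.ll test below.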
-diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-index 0691e07a639b..b3b42a97e8c9 100644
---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-@@ -11,6 +11,7 @@
- //===----------------------------------------------------------------------===//
-
- #include "AArch64InstrInfo.h"
-+#include "AArch64ExpandImm.h"
- #include "AArch64MachineFunctionInfo.h"
- #include "AArch64Subtarget.h"
- #include "MCTargetDesc/AArch64AddressingModes.h"
-@@ -18,6 +19,7 @@
- #include "llvm/ADT/ArrayRef.h"
- #include "llvm/ADT/STLExtras.h"
- #include "llvm/ADT/SmallVector.h"
-+#include "llvm/CodeGen/LivePhysRegs.h"
- #include "llvm/CodeGen/MachineBasicBlock.h"
- #include "llvm/CodeGen/MachineCombinerPattern.h"
- #include "llvm/CodeGen/MachineFrameInfo.h"
-@@ -8428,6 +8430,94 @@ unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
-   return AArch64::BLR;
- }
-
-+MachineBasicBlock::iterator
-+AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
-+                                   Register TargetReg, bool FrameSetup) const {
-+  assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
-+
-+  MachineBasicBlock &MBB = *MBBI->getParent();
-+  MachineFunction &MF = *MBB.getParent();
-+  const AArch64InstrInfo *TII =
-+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
-+  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
-+  DebugLoc DL = MBB.findDebugLoc(MBBI);
-+
-+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
-+  MachineBasicBlock *LoopTestMBB =
-+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
-+  MF.insert(MBBInsertPoint, LoopTestMBB);
-+  MachineBasicBlock *LoopBodyMBB =
-+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
-+  MF.insert(MBBInsertPoint, LoopBodyMBB);
-+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
-+  MF.insert(MBBInsertPoint, ExitMBB);
-+  MachineInstr::MIFlag Flags =
-+      FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
-+
-+  // LoopTest:
-+  //   SUB SP, SP, #ProbeSize
-+  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
-+                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
-+
-+  //   CMP SP, TargetReg
-+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
-+          AArch64::XZR)
-+      .addReg(AArch64::SP)
-+      .addReg(TargetReg)
-+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
-+      .setMIFlags(Flags);
-+
-+  //   B.LE LoopExit
-+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
-+      .addImm(AArch64CC::LE)
-+      .addMBB(ExitMBB)
-+      .setMIFlags(Flags);
-+
-+  //   STR XZR, [SP]
-+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
-+      .addReg(AArch64::XZR)
-+      .addReg(AArch64::SP)
-+      .addImm(0)
-+      .setMIFlags(Flags);
-+
-+  //   B loop
-+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
-+      .addMBB(LoopTestMBB)
-+      .setMIFlags(Flags);
-+
-+  // LoopExit:
-+  //   MOV SP, TargetReg
-+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
-+      .addReg(TargetReg)
-+      .addImm(0)
-+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
-+      .setMIFlags(Flags);
-+
-+  //   STR XZR, [SP]
-+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui))
-+      .addReg(AArch64::XZR)
-+      .addReg(AArch64::SP)
-+      .addImm(0)
-+      .setMIFlags(Flags);
-+
-+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
-+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
-+
-+  LoopTestMBB->addSuccessor(ExitMBB);
-+  LoopTestMBB->addSuccessor(LoopBodyMBB);
-+  LoopBodyMBB->addSuccessor(LoopTestMBB);
-+  MBB.addSuccessor(LoopTestMBB);
-+
-+  // Update liveins.
-+  if (MF.getRegInfo().reservedRegsFrozen()) {
-+    recomputeLiveIns(*LoopTestMBB);
-+    recomputeLiveIns(*LoopBodyMBB);
-+    recomputeLiveIns(*ExitMBB);
-+  }
-+
-+  return ExitMBB->begin();
-+}
-+
- #define GET_INSTRINFO_HELPERS
- #define GET_INSTRMAP_INFO
- #include "AArch64GenInstrInfo.inc"
-diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-index 20210a96d67a..7e84b86fc52c 100644
---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-@@ -340,6 +340,12 @@ public:
-   static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset,
-                                                   int64_t &ByteSized,
-                                                   int64_t &VGSized);
-+  // Decrement the SP, issuing probes along the way. `TargetReg` is the new top
-+  // of the stack. `FrameSetup` is passed as true if the allocation is part
-+  // of constructing the activation frame of a function.
-+  MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI,
-+                                               Register TargetReg,
-+                                               bool FrameSetup) const;
- #define GET_INSTRINFO_HELPER_DECLS
- #include "AArch64GenInstrInfo.inc"
-
-diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
-index 9e72d37880c5..09980c2f45e6 100644
---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
-+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
-@@ -880,7 +880,8 @@ include "SMEInstrFormats.td"
- // Miscellaneous instructions.
- //===----------------------------------------------------------------------===//
-
---let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
--+let hasSideEffects = 1, isCodeGenOnly = 1 in {
--+let Defs = [SP], Uses = [SP] in {
- // We set Sched to empty list because we expect these instructions to simply get
- // removed in most cases.
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
-@@ -889,7 +890,26 @@ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
-                             [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
-                             Sched<[]>;
---} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
--+}
--+
--+let Defs = [SP, NZCV], Uses = [SP] in {
--+// Probed stack allocation of a constant size, used in function prologues when
--+// stack-clash protection is enabled.
-+def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch),
-+                               (ins i64imm:$stacksize, i64imm:$fixed_offset,
-+                                i64imm:$scalable_offset),
-+                               []>,
-+                               Sched<[]>;
-+
-+// Probed stack allocation of a variable size, used in function prologues when
-+// stack-clash protection is enabled.
-+def PROBED_STACKALLOC_VAR : Pseudo<(outs),
-+                                   (ins GPR64sp:$target),
-+                                   []>,
-+                                   Sched<[]>;
-+
-+} // Defs = [SP, NZCV], Uses = [SP] in
-+} // hasSideEffects = 1, isCodeGenOnly = 1
-
- let isReMaterializable = 1, isCodeGenOnly = 1 in {
- // FIXME: The following pseudo instructions are only needed because remat
-diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
-index 961a19317d66..0bef3c2d2483 100644
---- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
-+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
-@@ -97,14 +97,45 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
-     if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
-             F.getParent()->getModuleFlag("branch-target-enforcement")))
-       BranchTargetEnforcement = BTE->getZExtValue();
--    return;
-+  } else {
-+    const StringRef BTIEnable =
-+        F.getFnAttribute("branch-target-enforcement").getValueAsString();
-+    assert(BTIEnable.equals_insensitive("true") ||
-+           BTIEnable.equals_insensitive("false"));
-+    BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
-   }
-
--  const StringRef BTIEnable =
--      F.getFnAttribute("branch-target-enforcement").getValueAsString();
--  assert(BTIEnable.equals_insensitive("true") ||
--         BTIEnable.equals_insensitive("false"));
--  BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
-+  // The default stack probe size is 4096 if the function has no
-+  // stack-probe-size attribute. This is a safe default because it is the
-+  // smallest possible guard page size.
-+  uint64_t ProbeSize = 4096;
-+  if (F.hasFnAttribute("stack-probe-size"))
-+    ProbeSize = F.getFnAttributeAsParsedInteger("stack-probe-size");
-+  else if (const auto *PS = mdconst::extract_or_null<ConstantInt>(
-+               F.getParent()->getModuleFlag("stack-probe-size")))
-+    ProbeSize = PS->getZExtValue();
-+  assert(int64_t(ProbeSize) > 0 && "Invalid stack probe size");
-+
-+  if (STI->isTargetWindows()) {
-+    if (!F.hasFnAttribute("no-stack-arg-probe"))
-+      StackProbeSize = ProbeSize;
-+  } else {
-+    // Round down to the stack alignment.
-+    uint64_t StackAlign =
-+        STI->getFrameLowering()->getTransientStackAlign().value();
-+    ProbeSize = std::max(StackAlign, ProbeSize & ~(StackAlign - 1U));
-+    StringRef ProbeKind;
-+    if (F.hasFnAttribute("probe-stack"))
-+      ProbeKind = F.getFnAttribute("probe-stack").getValueAsString();
-+    else if (const auto *PS = dyn_cast_or_null<MDString>(
-+                 F.getParent()->getModuleFlag("probe-stack")))
-+      ProbeKind = PS->getString();
-+    if (ProbeKind.size()) {
-+      if (ProbeKind != "inline-asm")
-+        report_fatal_error("Unsupported stack probing method");
-+      StackProbeSize = ProbeSize;
-+    }
-+  }
- }
-
- MachineFunctionInfo *AArch64FunctionInfo::clone(
-diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
-index d82fb436925e..d50011594eb1 100644
---- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
-+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
-@@ -192,6 +192,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
-   /// True if the function needs asynchronous unwind information.
-   mutable std::optional<bool> NeedsAsyncDwarfUnwindInfo;
-
-+  int64_t StackProbeSize = 0;
-+
- public:
-   AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
-
-@@ -447,6 +449,10 @@ public:
-   bool needsDwarfUnwindInfo(const MachineFunction &MF) const;
-   bool needsAsyncDwarfUnwindInfo(const MachineFunction &MF) const;
-
-+  bool hasStackProbing() const { return StackProbeSize != 0; }
-+
-+  int64_t getStackProbeSize() const { return StackProbeSize; }
-+
- private:
-   // Hold the lists of LOHs.
-   MILOHContainer LOHContainerSet;
-diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
-new file mode 100644
-index 000000000000..0a3198fc520e
---- /dev/null
-+++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
-@@ -0,0 +1,392 @@
-+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
-+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
-+
-+; Tests for prolog sequences for stack probing, when using a 64KiB stack guard.
-+
-+; 64k bytes is the largest frame we can probe in one go.
-+define void @static_65536(ptr %out) #0 {
-+; CHECK-LABEL: static_65536:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: .cfi_def_cfa_offset 65552
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+  %v = alloca i8, i64 65536, align 1
-+  store i8* %v, ptr %out, align 8
-+  ret void
-+}
-+
-+; 64k+16 bytes, still needs just one probe.
-+define void @static_65552(ptr %out) #0 {
-+; CHECK-LABEL: static_65552:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: .cfi_def_cfa_offset 65552
-+; CHECK-NEXT: str xzr, [sp], #-16
-+; CHECK-NEXT: .cfi_def_cfa_offset 65568
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: .cfi_def_cfa_offset 32
-+; CHECK-NEXT: add sp, sp, #16
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+  %v = alloca i8, i64 65552, align 1
-+  store i8* %v, ptr %out, align 8
-+  ret void
-+}
-+
-+; 64k+1024 bytes, the largest frame which needs just one probe.
-+define void @static_66560(ptr %out) #0 {
-+; CHECK-LABEL: static_66560:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 65552 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 66576 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 1040 -+; CHECK-NEXT: add sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 66560, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 64k+1024+16 bytes, the smallest frame which needs two probes. -+define void @static_66576(ptr %out) #0 { -+; CHECK-LABEL: static_66576: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 65552 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 66592 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 1056 -+; CHECK-NEXT: add sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 66576, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 2*64k+1024, the largest frame needing two probes. -+define void @static_132096(ptr %out) #0 { -+; CHECK-LABEL: static_132096: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 65552 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 131088 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 132112 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #32, lsl #12 // =131072 -+; CHECK-NEXT: .cfi_def_cfa_offset 1040 -+; CHECK-NEXT: add sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 132096, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 5*64k-16, the largest frame probed without a loop. -+define void @static_327664(ptr %out) #0 { -+; CHECK-LABEL: static_327664: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: .cfi_def_cfa_offset 65552
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: .cfi_def_cfa_offset 131088
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: .cfi_def_cfa_offset 196624
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: .cfi_def_cfa_offset 262160
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440
-+; CHECK-NEXT: .cfi_def_cfa_offset 323600
-+; CHECK-NEXT: sub sp, sp, #4080
-+; CHECK-NEXT: .cfi_def_cfa_offset 327680
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #79, lsl #12 // =323584
-+; CHECK-NEXT: .cfi_def_cfa_offset 4096
-+; CHECK-NEXT: add sp, sp, #4080
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+  %v = alloca i8, i64 327664, align 1
-+  store i8* %v, ptr %out, align 8
-+  ret void
-+}
-+
-+; 5*64k, smallest frame probed with a loop.
-+define void @static_327680(ptr %out) #0 {
-+; CHECK-LABEL: static_327680:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
-+; CHECK-NEXT: .cfi_def_cfa w9, 327696
-+; CHECK-NEXT: .LBB6_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.ne .LBB6_1
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+  %v = alloca i8, i64 327680, align 1
-+  store i8* %v, ptr %out, align 8
-+  ret void
-+}
-+
-+; 5*64k+1024, large enough to use a loop, but not a multiple of 64KiB
-+; so has a remainder, but no extra probe.
-+define void @static_328704(ptr %out) #0 {
-+; CHECK-LABEL: static_328704:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
-+; CHECK-NEXT: .cfi_def_cfa w9, 327696
-+; CHECK-NEXT: .LBB7_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.ne .LBB7_1
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: sub sp, sp, #1024
-+; CHECK-NEXT: .cfi_def_cfa_offset 328720
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
-+; CHECK-NEXT: .cfi_def_cfa_offset 1040
-+; CHECK-NEXT: add sp, sp, #1024
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+  %v = alloca i8, i64 328704, align 1
-+  store i8* %v, ptr %out, align 8
-+  ret void
-+}
-+
-+; 5*64k+1040, large enough to use a loop, has a remainder and
-+; an extra probe.
-+define void @static_328720(ptr %out) #0 {
-+; CHECK-LABEL: static_328720:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
-+; CHECK-NEXT: .cfi_def_cfa w9, 327696
-+; CHECK-NEXT: .LBB8_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.ne .LBB8_1
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: sub sp, sp, #1040
-+; CHECK-NEXT: .cfi_def_cfa_offset 328736
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
-+; CHECK-NEXT: .cfi_def_cfa_offset 1056
-+; CHECK-NEXT: add sp, sp, #1040
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+  %v = alloca i8, i64 328720, align 1
-+  store i8* %v, ptr %out, align 8
-+  ret void
-+}
-+
-+; A small allocation, but with a very large alignment requirement. We do this
-+; by moving SP far enough that a sufficiently-aligned block will exist
-+; somewhere in the stack frame, so must probe the whole of that larger SP move.
-+define void @static_16_align_131072(ptr %out) #0 {
-+; CHECK-LABEL: static_16_align_131072:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
// 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #31, lsl #12 // =126976 -+; CHECK-NEXT: sub x9, x9, #4080 -+; CHECK-NEXT: and x9, x9, #0xfffffffffffe0000 -+; CHECK-NEXT: .LBB9_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB9_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: b .LBB9_1 -+; CHECK-NEXT: .LBB9_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 16, align 131072 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; A small allocation, but with a very large alignment requirement which -+; is nevertheless small enough as to not need a loop. -+define void @static_16_align_8192(ptr %out) #0 { -+; CHECK-LABEL: static_16_align_8192: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: sub x9, x9, #4080 -+; CHECK-NEXT: and sp, x9, #0xffffffffffffe000 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 16, align 8192 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; A large allocation with a very large alignment requirement which -+; is nevertheless small enough as to not need a loop. -+define void @static_32752_align_32k(ptr %out) #0 { -+; CHECK-LABEL: static_32752_align_32k: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #7, lsl #12 // =28672 -+; CHECK-NEXT: sub x9, x9, #4080 -+; CHECK-NEXT: and sp, x9, #0xffffffffffff8000 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 32752, align 32768 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" } -\ No newline at end of file -diff --git a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir -new file mode 100644 -index 000000000000..a8a21ab330ba ---- /dev/null -+++ b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir -@@ -0,0 +1,146 @@ -+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -+# RUN: llc -run-pass=prologepilog %s -o - | FileCheck %s -+# Regression test for a crash when the probing instruction -+# to replace is last in the block. -+--- | -+ source_filename = "tt.ll" -+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -+ target triple = "aarch64-linux" -+ -+ declare i1 @g(ptr) -+ -+ define void @f(ptr %out) #0 { -+ entry: -+ %p = alloca i32, i32 50000, align 4 -+ br label %loop -+ -+ loop: ; preds = %loop, %entry -+ %c = call i1 @g(ptr %p) -+ br i1 %c, label %loop, label %exit -+ -+ exit: ; preds = %loop -+ ret void -+ } -+ -+ attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" "target-features"="+sve" } -+ -+... 
-+--- -+name: f -+alignment: 4 -+exposesReturnsTwice: false -+legalized: false -+regBankSelected: false -+selected: false -+failedISel: false -+tracksRegLiveness: true -+hasWinCFI: false -+callsEHReturn: false -+callsUnwindInit: false -+hasEHCatchret: false -+hasEHScopes: false -+hasEHFunclets: false -+isOutlined: false -+debugInstrRef: false -+failsVerification: false -+tracksDebugUserValues: true -+registers: [] -+liveins: [] -+frameInfo: -+ isFrameAddressTaken: false -+ isReturnAddressTaken: false -+ hasStackMap: false -+ hasPatchPoint: false -+ stackSize: 0 -+ offsetAdjustment: 0 -+ maxAlignment: 4 -+ adjustsStack: true -+ hasCalls: true -+ stackProtector: '' -+ functionContext: '' -+ maxCallFrameSize: 0 -+ cvBytesOfCalleeSavedRegisters: 0 -+ hasOpaqueSPAdjustment: false -+ hasVAStart: false -+ hasMustTailInVarArgFunc: false -+ hasTailCall: false -+ localFrameSize: 200000 -+ savePoint: '' -+ restorePoint: '' -+fixedStack: [] -+stack: -+ - { id: 0, name: p, type: default, offset: 0, size: 200000, alignment: 4, -+ stack-id: default, callee-saved-register: '', callee-saved-restored: true, -+ local-offset: -200000, debug-info-variable: '', debug-info-expression: '', -+ debug-info-location: '' } -+entry_values: [] -+callSites: [] -+debugValueSubstitutions: [] -+constants: [] -+machineFunctionInfo: {} -+body: | -+ ; CHECK-LABEL: name: f -+ ; CHECK: bb.0.entry: -+ ; CHECK-NEXT: successors: %bb.3(0x80000000) -+ ; CHECK-NEXT: liveins: $lr, $fp -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.2), (store (s64) into %stack.1) -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -+ ; CHECK-NEXT: $x9 = frame-setup SUBXri $sp, 48, 12 -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w9, 196624 -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: bb.3.entry: -+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) -+ ; CHECK-NEXT: liveins: $x9 -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1, 12 -+ ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 -+ ; CHECK-NEXT: $xzr = frame-setup SUBSXrx64 $sp, $x9, 24, implicit-def $nzcv -+ ; CHECK-NEXT: frame-setup Bcc 1, %bb.3, implicit $nzcv -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: bb.4.entry: -+ ; CHECK-NEXT: successors: %bb.1(0x80000000) -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $wsp -+ ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 3392, 0 -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 200016 -+ ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: bb.1.loop: -+ ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: $x0 = ADDXri $sp, 0, 0 -+ ; CHECK-NEXT: BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 -+ ; CHECK-NEXT: TBNZW killed renamable $w0, 0, %bb.1 -+ ; CHECK-NEXT: B %bb.2 -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: bb.2.exit: -+ ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 48, 12 -+ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 3408 -+ ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 3392, 0 -+ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 -+ ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1) -+ ; CHECK-NEXT: 
frame-destroy CFI_INSTRUCTION def_cfa_offset 0 -+ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 -+ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 -+ ; CHECK-NEXT: RET_ReallyLR -+ bb.0.entry: -+ successors: %bb.1(0x80000000) -+ -+ -+ bb.1.loop: -+ successors: %bb.1(0x7c000000), %bb.2(0x04000000) -+ -+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp -+ $x0 = ADDXri %stack.0.p, 0, 0 -+ BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 -+ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp -+ TBNZW killed renamable $w0, 0, %bb.1 -+ B %bb.2 -+ -+ bb.2.exit: -+ RET_ReallyLR -+ -+... -\ No newline at end of file -diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll -new file mode 100644 -index 000000000000..e765d071e722 ---- /dev/null -+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll -@@ -0,0 +1,724 @@ -+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s -+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s -+ -+; Test prolog sequences for stack probing when SVE objects are involved. -+ -+; The space for SVE objects needs probing in the general case, because -+; the stack adjustment may happen to be too big (i.e. greater than the -+; probe size) to allocate with a single `addvl`. -+; When we do know that the stack adjustment cannot exceed the probe size -+; we can avoid emitting a probe loop and emit a simple `addvl; str` -+; sequence instead. -+ -+define void @sve_1_vector(ptr %out) #0 { -+; CHECK-LABEL: sve_1_vector: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -+; CHECK-NEXT: addvl sp, sp, #1 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec = alloca , align 16 -+ ret void -+} -+ -+; As above, but with 4 SVE vectors of stack space. -+define void @sve_4_vector(ptr %out) #0 { -+; CHECK-LABEL: sve_4_vector: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-4 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG -+; CHECK-NEXT: addvl sp, sp, #4 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec1 = alloca , align 16 -+ %vec2 = alloca , align 16 -+ %vec3 = alloca , align 16 -+ %vec4 = alloca , align 16 -+ ret void -+} -+ -+; As above, but with 16 SVE vectors of stack space. -+; The stack adjustment is less than or equal to 16 x 256 = 4096, so -+; we can allocate the locals at once. -+define void @sve_16_vector(ptr %out) #0 { -+; CHECK-LABEL: sve_16_vector: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: addvl sp, sp, #-16
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: addvl sp, sp, #16
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+  %vec1 = alloca , align 16
-+  %vec2 = alloca , align 16
-+  %vec3 = alloca , align 16
-+  %vec4 = alloca , align 16
-+  %vec5 = alloca , align 16
-+  %vec6 = alloca , align 16
-+  %vec7 = alloca , align 16
-+  %vec8 = alloca , align 16
-+  %vec9 = alloca , align 16
-+  %vec10 = alloca , align 16
-+  %vec11 = alloca , align 16
-+  %vec12 = alloca , align 16
-+  %vec13 = alloca , align 16
-+  %vec14 = alloca , align 16
-+  %vec15 = alloca , align 16
-+  %vec16 = alloca , align 16
-+  ret void
-+}
-+
-+; As above, but with 17 SVE vectors of stack space. Now we need
-+; a probing loop since the stack adjustment may be greater than
-+; the probe size (17 x 256 = 4352 bytes).
-+; TODO: Allocating `k*16+r` SVE vectors can be unrolled into
-+; emitting the `k + r` sequences of `addvl sp, sp, #-N; str xzr, [sp]`
-+define void @sve_17_vector(ptr %out) #0 {
-+; CHECK-LABEL: sve_17_vector:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: addvl x9, sp, #-17
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
-+; CHECK-NEXT: .LBB3_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.le .LBB3_3
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: b .LBB3_1
-+; CHECK-NEXT: .LBB3_3: // %entry
-+; CHECK-NEXT: mov sp, x9
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: addvl sp, sp, #17
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+  %vec1 = alloca , align 16
-+  %vec2 = alloca , align 16
-+  %vec3 = alloca , align 16
-+  %vec4 = alloca , align 16
-+  %vec5 = alloca , align 16
-+  %vec6 = alloca , align 16
-+  %vec7 = alloca , align 16
-+  %vec8 = alloca , align 16
-+  %vec9 = alloca , align 16
-+  %vec10 = alloca , align 16
-+  %vec11 = alloca , align 16
-+  %vec12 = alloca , align 16
-+  %vec13 = alloca , align 16
-+  %vec14 = alloca , align 16
-+  %vec15 = alloca , align 16
-+  %vec16 = alloca , align 16
-+  %vec17 = alloca , align 16
-+  ret void
-+}
-+
-+; Space for callee-saved SVE registers is allocated similarly to allocating
-+; space for SVE locals. When we know the stack adjustment cannot exceed the
-+; probe size we can skip the explicit probe, since saving SVE registers serves
-+; as an implicit probe.
-+define void @sve_1v_csr( %a) #0 {
-+; CHECK-LABEL: sve_1v_csr:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload -+; CHECK-NEXT: addvl sp, sp, #1 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: .cfi_restore z8 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{z8}" () -+ ret void -+} -+ -+define void @sve_4v_csr( %a) #0 { -+; CHECK-LABEL: sve_4v_csr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-4 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG -+; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill -+; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: addvl sp, sp, #4 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: .cfi_restore z8 -+; CHECK-NEXT: .cfi_restore z9 -+; CHECK-NEXT: .cfi_restore z10 -+; CHECK-NEXT: .cfi_restore z11 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" () -+ ret void -+} -+ -+define void @sve_16v_csr( %a) #0 { -+; CHECK-LABEL: sve_16v_csr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-16 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill -+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: addvl 
sp, sp, #16 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: .cfi_restore z8 -+; CHECK-NEXT: .cfi_restore z9 -+; CHECK-NEXT: .cfi_restore z10 -+; CHECK-NEXT: .cfi_restore z11 -+; CHECK-NEXT: .cfi_restore z12 -+; CHECK-NEXT: .cfi_restore z13 -+; CHECK-NEXT: .cfi_restore z14 -+; CHECK-NEXT: .cfi_restore z15 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () -+ ret void -+} -+ -+define void @sve_1p_csr( %a) #0 { -+; CHECK-LABEL: sve_1p_csr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -+; CHECK-NEXT: addvl sp, sp, #1 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{p8}" () -+ ret void -+} -+ -+define void @sve_4p_csr( %a) #0 { -+; CHECK-LABEL: sve_4p_csr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -+; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill -+; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill -+; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill -+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload -+; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload -+; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload -+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -+; CHECK-NEXT: addvl sp, sp, #1 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" () -+ ret void -+} -+ -+define void @sve_16v_1p_csr( %a) #0 { -+; CHECK-LABEL: sve_16v_1p_csr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl x9, sp, #-17 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG -+; CHECK-NEXT: .LBB9_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB9_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: b .LBB9_1 -+; CHECK-NEXT: .LBB9_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: .cfi_def_cfa_register wsp -+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -+; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -+; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z16, [sp, #8, mul 
vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload -+; CHECK-NEXT: addvl sp, sp, #17 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: .cfi_restore z8 -+; CHECK-NEXT: .cfi_restore z9 -+; CHECK-NEXT: .cfi_restore z10 -+; CHECK-NEXT: .cfi_restore z11 -+; CHECK-NEXT: .cfi_restore z12 -+; CHECK-NEXT: .cfi_restore z13 -+; CHECK-NEXT: .cfi_restore z14 -+; CHECK-NEXT: .cfi_restore z15 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () -+ ret void -+} -+ -+; A SVE vector and a 16-byte fixed size object. -+define void @sve_1_vector_16_arr(ptr %out) #0 { -+; CHECK-LABEL: sve_1_vector_16_arr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16 -+; CHECK-NEXT: .cfi_def_cfa_offset 32 -+; CHECK-NEXT: addvl sp, sp, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG -+; CHECK-NEXT: addvl sp, sp, #1 -+; CHECK-NEXT: .cfi_def_cfa wsp, 32 -+; CHECK-NEXT: add sp, sp, #16 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec = alloca , align 16 -+ %arr = alloca i8, i64 16, align 1 -+ ret void -+} -+ -+; A large SVE stack object and a large stack slot, both of which need probing. -+; TODO: This could be optimised by combining the fixed-size offset into the -+; loop. -+define void @sve_1_vector_4096_arr(ptr %out) #0 { -+; CHECK-LABEL: sve_1_vector_4096_arr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 -+; CHECK-NEXT: .cfi_def_cfa w9, 12304 -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG -+; CHECK-NEXT: .LBB11_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB11_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: b .LBB11_1 -+; CHECK-NEXT: .LBB11_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: .cfi_def_cfa_register wsp -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG -+; CHECK-NEXT: addvl sp, sp, #2 -+; CHECK-NEXT: .cfi_def_cfa wsp, 12304 -+; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec = alloca , align 16 -+ %arr = alloca i8, i64 12288, align 1 -+ ret void -+} -+ -+; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently -+; supported even without stack-probing. -+ -+; An SVE vector, and a 16-byte fixed size object, which -+; has a large alignment requirement. -+define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 { -+; CHECK-LABEL: sve_1_vector_16_arr_align_8192: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: sub x9, x9, #4080 -+; CHECK-NEXT: addvl x9, x9, #-1 -+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 -+; CHECK-NEXT: .LBB12_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB12_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB12_1 Depth=1 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: b .LBB12_1 -+; CHECK-NEXT: .LBB12_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec = alloca , align 16 -+ %arr = alloca i8, i64 16, align 8192 -+ ret void -+} -+ -+; With 64k guard pages, we can allocate bigger SVE space without a probing loop. 
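-+;
-+; The 64 KiB probe size used by the next two functions comes from the
-+; "stack-probe-size" function attribute, which takes precedence over the
-+; module flag of the same name. A minimal sketch of both knobs, with
-+; illustrative names that are not part of this test:
-+;
-+;   define void @probe_size_demo() "probe-stack"="inline-asm" "stack-probe-size"="65536" {
-+;     %buf = alloca i8, i64 65536, align 1
-+;     ret void
-+;   }
-+;
-+;   !llvm.module.flags = !{!0}
-+;   !0 = !{i32 8, !"stack-probe-size", i32 65536}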
-+define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { -+; CHECK-LABEL: sve_1024_64k_guard: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG -+; CHECK-NEXT: addvl sp, sp, #8 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec = alloca , align 16 -+ ret void -+} -+ -+define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { -+; CHECK-LABEL: sve_1028_64k_guard: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl x9, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG -+; CHECK-NEXT: addvl x9, x9, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG -+; CHECK-NEXT: .LBB14_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB14_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB14_1 Depth=1 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: b .LBB14_1 -+; CHECK-NEXT: .LBB14_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: .cfi_def_cfa_register wsp -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG -+; 
CHECK-NEXT: addvl sp, sp, #9
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %vec = alloca , align 16
-+ %vec1 = alloca , align 16
-+ ret void
-+}
-+
-+; With 5 SVE vectors of stack space the unprobed area
-+; at the top of the stack can exceed 1024 bytes (5 x 256 == 1280),
-+; hence we need to issue a probe.
-+define void @sve_5_vector(ptr %out) #0 {
-+; CHECK-LABEL: sve_5_vector:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: addvl sp, sp, #-5
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: addvl sp, sp, #5
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %vec1 = alloca , align 16
-+ %vec2 = alloca , align 16
-+ %vec3 = alloca , align 16
-+ %vec4 = alloca , align 16
-+ %vec5 = alloca , align 16
-+ ret void
-+}
-+
-+; Test with 14 scalable bytes (so up to 14 * 16 = 224) of unprobed
-+; area below the save location of `p9`.
-+define void @sve_unprobed_area( %a, i32 %n) #0 {
-+; CHECK-LABEL: sve_unprobed_area:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: addvl sp, sp, #-4
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
-+; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
-+; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
-+; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
-+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
-+; CHECK-NEXT: addvl sp, sp, #-4
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
-+; CHECK-NEXT: //APP
-+; CHECK-NEXT: //NO_APP
-+; CHECK-NEXT: addvl sp, sp, #4
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
-+; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
-+; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
-+; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
-+; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
-+; CHECK-NEXT: addvl sp, sp, #4
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: .cfi_restore z8
-+; CHECK-NEXT: .cfi_restore z9
-+; CHECK-NEXT: .cfi_restore z10
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" ()
-+
-+ %v0 = alloca , align 16
-+ %v1 = alloca , align 16
-+ %v2 = alloca , align 16
-+ %v3 = alloca , align 16
-+
-+ ret void
-+}
-+
-+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }
-\ No newline at end of file
-diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll
-new file mode 100644
-index 000000000000..95001450622f
---- /dev/null
-+++ b/llvm/test/CodeGen/AArch64/stack-probing.ll
-@@ -0,0 +1,539 @@
-+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
-+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
-+
-+; Tests for prolog sequences for stack probing, when using a 4KiB stack guard.
-+
-+; The stack probing parameters in function attributes take precedence over
-+; ones in the module flags.
-+
-+; Small stack frame, no probing required.
-+define void @static_64(ptr %out) #0 {
-+; CHECK-LABEL: static_64:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: sub sp, sp, #64
-+; CHECK-NEXT: .cfi_def_cfa_offset 64
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #64
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 64, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; At 256 bytes we start to always create a frame pointer. No frame smaller than
-+; this needs a probe, so we can use the saving of at least one CSR as a probe
-+; at the top of our frame.
-+define void @static_256(ptr %out) #0 {
-+; CHECK-LABEL: static_256:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: sub sp, sp, #272
-+; CHECK-NEXT: .cfi_def_cfa_offset 272
-+; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #272
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 256, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; At 1024 bytes, this is the largest frame which doesn't need probing.
-+define void @static_1024(ptr %out) #0 {
-+; CHECK-LABEL: static_1024:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub sp, sp, #1024
-+; CHECK-NEXT: .cfi_def_cfa_offset 1040
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #1024
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 1024, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; At 1024+16 bytes, this is the smallest frame which needs probing.
-+define void @static_1040(ptr %out) #0 {
-+; CHECK-LABEL: static_1040:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 1056 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 1040, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 4k bytes is the largest frame we can probe in one go. -+define void @static_4096(ptr %out) #0 { -+; CHECK-LABEL: static_4096: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 4096, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 4k+16 bytes, still needs just one probe. -+define void @static_4112(ptr %out) #0 { -+; CHECK-LABEL: static_4112: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, [sp], #-16 -+; CHECK-NEXT: .cfi_def_cfa_offset 4128 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 32 -+; CHECK-NEXT: add sp, sp, #16 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 4112, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 4k+1024 bytes, the largest frame which needs just one probe. -+define void @static_5120(ptr %out) #0 { -+; CHECK-LABEL: static_5120: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 5136 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 1040 -+; CHECK-NEXT: add sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 5120, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 4k+1024+16, the smallest frame which needs two probes. -+define void @static_5136(ptr %out) #0 { -+; CHECK-LABEL: static_5136: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 5152 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 1056 -+; CHECK-NEXT: add sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 5136, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 2*4k+1024, the largest frame needing two probes -+define void @static_9216(ptr %out) #0 { -+; CHECK-LABEL: static_9216: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 8208 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 9232 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 -+; CHECK-NEXT: .cfi_def_cfa_offset 1040 -+; CHECK-NEXT: add sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 9216, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 5*4k-16, the largest frame probed without a loop -+define void @static_20464(ptr %out) #0 { -+; CHECK-LABEL: static_20464: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 8208 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 12304 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 16400 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: sub sp, sp, #4080 -+; CHECK-NEXT: .cfi_def_cfa_offset 20480 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #4, lsl #12 // =16384 -+; CHECK-NEXT: .cfi_def_cfa_offset 4096 -+; CHECK-NEXT: add sp, sp, #4080 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 20464, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 5*4k, the smallest frame probed with a loop -+define void @static_20480(ptr %out) #0 { -+; CHECK-LABEL: static_20480: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa w9, 20496
-+; CHECK-NEXT: .LBB10_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.ne .LBB10_1
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 20480, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; 5*4k + 1024, large enough to use a loop, but not a multiple of 4KiB
-+; so has a remainder, but no extra probe.
-+define void @static_21504(ptr %out) #0 {
-+; CHECK-LABEL: static_21504:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa w9, 20496
-+; CHECK-NEXT: .LBB11_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.ne .LBB11_1
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: sub sp, sp, #1024
-+; CHECK-NEXT: .cfi_def_cfa_offset 21520
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa_offset 1040
-+; CHECK-NEXT: add sp, sp, #1024
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 21504, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; 5*4k+1040, large enough to use a loop, has a remainder and
-+; an extra probe.
-+define void @static_21520(ptr %out) #0 {
-+; CHECK-LABEL: static_21520:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa w9, 20496
-+; CHECK-NEXT: .LBB12_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.ne .LBB12_1
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: sub sp, sp, #1040
-+; CHECK-NEXT: .cfi_def_cfa_offset 21536
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa_offset 1056
-+; CHECK-NEXT: add sp, sp, #1040
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 21520, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; A small allocation, but with a very large alignment requirement.
We do this -+; by moving SP far enough that a sufficiently-aligned block will exist -+; somewhere in the stack frame, so must probe the whole of that larger SP move. -+define void @static_16_align_8192(ptr %out) #0 { -+; CHECK-LABEL: static_16_align_8192: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: sub x9, x9, #4080 -+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 -+; CHECK-NEXT: .LBB13_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB13_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB13_1 Depth=1 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: b .LBB13_1 -+; CHECK-NEXT: .LBB13_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 16, align 8192 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; A small allocation with a very large alignment requirement, but -+; nevertheless small enough as to not need a loop. -+define void @static_16_align_2048(ptr %out) #0 { -+; CHECK-LABEL: static_16_align_2048: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #2032 -+; CHECK-NEXT: and sp, x9, #0xfffffffffffff800 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 16, align 2048 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; A large(-ish) allocation with a very large alignment requirement, but -+; nevertheless small enough as to not need a loop. -+define void @static_2032_align_2048(ptr %out) #0 { -+; CHECK-LABEL: static_2032_align_2048: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #2032 -+; CHECK-NEXT: and sp, x9, #0xfffffffffffff800 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 2032, align 2048 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; Test stack probing is enabled by module flags -+define void @static_9232(ptr %out) uwtable(async) { -+; CHECK-LABEL: static_9232: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #2, lsl #12 // =8192 -+; CHECK-NEXT: .cfi_def_cfa_offset 8208 -+; CHECK-NEXT: sub sp, sp, #800 -+; CHECK-NEXT: .cfi_def_cfa_offset 9008 -+; CHECK-NEXT: str xzr, [sp], #-240 -+; CHECK-NEXT: .cfi_def_cfa_offset 9248 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 -+; CHECK-NEXT: .cfi_def_cfa_offset 1056 -+; CHECK-NEXT: add sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 9232, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; Test for a tight upper bound on the amount of stack adjustment -+; due to stack realignment. No probes should appear. -+define void @static_1008(ptr %out) #0 { -+; CHECK-LABEL: static_1008: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #1008 -+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, [x0] -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i32 1008, align 32 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" } -+ -+!llvm.module.flags = !{!0, !1} -+ -+!0 = !{i32 4, !"probe-stack", !"inline-asm"} -+!1 = !{i32 8, !"stack-probe-size", i32 9000} -\ No newline at end of file --- -2.42.0.windows.2 - diff --git a/0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch b/0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch deleted file mode 100644 index 2cd9d3e..0000000 --- a/0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch +++ /dev/null @@ -1,744 +0,0 @@ -From e433199a7dbe87324a671299f6509f19d295382f Mon Sep 17 00:00:00 2001 -From: rickyleung -Date: Fri, 26 Apr 2024 16:59:48 +0800 -Subject: [PATCH 5/7] [backport][AArch64] Stack probing for dynamic allocas in - SelectionDAG - -Reference: https://github.com/llvm/llvm-project/commit/b1806e6a1f0589acc88499419531c4eb82488f1a - -Add support for probing for dynamic allocas (variable-size objects and -outgoing stack arguments). - -Co-authored-by: Oliver Stannard ---- - .../Target/AArch64/AArch64FrameLowering.cpp | 26 ++ - .../Target/AArch64/AArch64ISelLowering.cpp | 152 +++++--- - llvm/lib/Target/AArch64/AArch64ISelLowering.h | 13 +- - llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14 + - .../stack-probing-dynamic-no-frame-setup.ll | 14 + - .../CodeGen/AArch64/stack-probing-dynamic.ll | 362 ++++++++++++++++++ - 6 files changed, 526 insertions(+), 55 deletions(-) - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll - -diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -index af019ab23770..fe21173f531f 100644 ---- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -@@ -462,6 +462,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { - /// included as part of the stack frame. - bool - AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { -+ // The stack probing code for the dynamically allocated outgoing arguments -+ // area assumes that the stack is probed at the top - either by the prologue -+ // code, which issues a probe if `hasVarSizedObjects` return true, or by the -+ // most recent variable-sized object allocation. Changing the condition here -+ // may need to be followed up by changes to the probe issuing logic. 
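-+ // As a worked example (numbers from the tests added later in this patch):
-+ // a call site whose arguments include a [138 x i64] passed on the stack
-+ // needs 138 * 8 = 1104 bytes of call frame; 1104 > 1024
-+ // (AArch64::StackProbeMaxUnprobedStack), so when that area is allocated
-+ // dynamically it must itself be probed - see @no_reserved_call_frame in
-+ // stack-probing-dynamic.ll.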
- return !MF.getFrameInfo().hasVarSizedObjects(); - } - -@@ -470,6 +475,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( - MachineBasicBlock::iterator I) const { - const AArch64InstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); -+ const AArch64TargetLowering *TLI = -+ MF.getSubtarget().getTargetLowering(); -+ MachineFrameInfo &MFI = MF.getFrameInfo(); - DebugLoc DL = I->getDebugLoc(); - unsigned Opc = I->getOpcode(); - bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); -@@ -496,6 +504,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( - // Most call frames will be allocated at the start of a function so - // this is OK, but it is a limitation that needs dealing with. - assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); -+ -+ if (TLI->hasInlineStackProbe(MF) && -+ -Amount >= AArch64::StackProbeMaxUnprobedStack) { -+ // When stack probing is enabled, the decrement of SP may need to be -+ // probed. We only need to do this if the call site needs 1024 bytes of -+ // space or more, because a region smaller than that is allowed to be -+ // unprobed at an ABI boundary. We rely on the fact that SP has been -+ // probed exactly at this point, either by the prologue or most recent -+ // dynamic allocation. -+ assert(MFI.hasVarSizedObjects() && -+ "non-reserved call frame without var sized objects?"); -+ Register ScratchReg = -+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); -+ inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0)); -+ } else { -+ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -+ StackOffset::getFixed(Amount), TII); -+ } - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(Amount), TII); - } -diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp -index 082043420fb9..eff0722e1c77 100644 ---- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp -+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp -@@ -556,10 +556,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, - setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); - setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - -- if (Subtarget->isTargetWindows()) -- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); -- else -- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); -+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); - - // Constant pool entries - setOperationAction(ISD::ConstantPool, MVT::i64, Custom); -@@ -2288,6 +2285,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { - MAKE_CASE(AArch64ISD::CSINC) - MAKE_CASE(AArch64ISD::THREAD_POINTER) - MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) -+ MAKE_CASE(AArch64ISD::PROBED_ALLOCA) - MAKE_CASE(AArch64ISD::ABDS_PRED) - MAKE_CASE(AArch64ISD::ABDU_PRED) - MAKE_CASE(AArch64ISD::HADDS_PRED) -@@ -2646,6 +2644,22 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( - return BB; - } - -+MachineBasicBlock * -+AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI, -+ MachineBasicBlock *MBB) const { -+ MachineFunction &MF = *MBB->getParent(); -+ MachineBasicBlock::iterator MBBI = MI.getIterator(); -+ DebugLoc DL = MBB->findDebugLoc(MBBI); -+ const AArch64InstrInfo &TII = -+ *MF.getSubtarget().getInstrInfo(); -+ Register TargetReg = MI.getOperand(0).getReg(); -+ MachineBasicBlock::iterator NextInst = -+ TII.probedStackAlloc(MBBI, TargetReg, 
false); -+ -+ MI.eraseFromParent(); -+ return NextInst->getParent(); -+} -+ - MachineBasicBlock * - AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, - MachineInstr &MI, -@@ -2774,6 +2788,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( - - case AArch64::CATCHRET: - return EmitLoweredCatchRet(MI, BB); -+ case AArch64::PROBED_STACKALLOC_DYN: -+ return EmitDynamicProbedAlloc(MI, BB); - case AArch64::LD1_MXIPXX_H_PSEUDO_B: - return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); - case AArch64::LD1_MXIPXX_H_PSEUDO_H: -@@ -13666,9 +13682,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, - AN->getMemOperand()); - } - --SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( -- SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { -+SDValue -+AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, -+ SelectionDAG &DAG) const { -+ - SDLoc dl(Op); -+ // Get the inputs. -+ SDNode *Node = Op.getNode(); -+ SDValue Chain = Op.getOperand(0); -+ SDValue Size = Op.getOperand(1); -+ MaybeAlign Align = -+ cast(Op.getOperand(2))->getMaybeAlignValue(); -+ EVT VT = Node->getValueType(0); -+ -+ if (DAG.getMachineFunction().getFunction().hasFnAttribute( -+ "no-stack-arg-probe")) { -+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); -+ Chain = SP.getValue(1); -+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); -+ if (Align) -+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), -+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); -+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); -+ SDValue Ops[2] = {SP, Chain}; -+ return DAG.getMergeValues(Ops, dl); -+ } -+ -+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); -+ - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(), - PtrVT, 0); -@@ -13692,7 +13733,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( - - Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, - DAG.getConstant(4, dl, MVT::i64)); -- return Chain; -+ -+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); -+ Chain = SP.getValue(1); -+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); -+ if (Align) -+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), -+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); -+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); -+ -+ Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); -+ -+ SDValue Ops[2] = {SP, Chain}; -+ return DAG.getMergeValues(Ops, dl); -+} -+ -+SDValue -+AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, -+ SelectionDAG &DAG) const { -+ // Get the inputs. -+ SDNode *Node = Op.getNode(); -+ SDValue Chain = Op.getOperand(0); -+ SDValue Size = Op.getOperand(1); -+ -+ MaybeAlign Align = -+ cast(Op.getOperand(2))->getMaybeAlignValue(); -+ SDLoc dl(Op); -+ EVT VT = Node->getValueType(0); -+ -+ // Construct the new SP value in a GPR. -+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); -+ Chain = SP.getValue(1); -+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); -+ if (Align) -+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), -+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); -+ -+ // Set the real SP to the new value with a probing loop. 
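-+ // (Sketch of the expansion, for orientation: PROBED_ALLOCA selects the
-+ // PROBED_STACKALLOC_DYN pseudo added below, whose custom inserter calls
-+ // probedStackAlloc to emit the compare-and-probe loop seen in the tests.
-+ // While the loop runs, SP can sit below the final target, but by less than
-+ // one probe-size step, and nothing below the target is probed.)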
-+ Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); -+ SDValue Ops[2] = {SP, Chain}; -+ return DAG.getMergeValues(Ops, dl); -+} -+ -+SDValue -+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, -+ SelectionDAG &DAG) const { -+ MachineFunction &MF = DAG.getMachineFunction(); -+ -+ if (Subtarget->isTargetWindows()) -+ return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG); -+ else if (hasInlineStackProbe(MF)) -+ return LowerInlineDYNAMIC_STACKALLOC(Op, DAG); -+ else -+ return SDValue(); - } - - // When x and y are extended, lower: -@@ -13746,51 +13839,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG, - return DAG.getNode(ISD::ADD, dl, VT, Add, tmp); - } - --SDValue --AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, -- SelectionDAG &DAG) const { -- assert(Subtarget->isTargetWindows() && -- "Only Windows alloca probing supported"); -- SDLoc dl(Op); -- // Get the inputs. -- SDNode *Node = Op.getNode(); -- SDValue Chain = Op.getOperand(0); -- SDValue Size = Op.getOperand(1); -- MaybeAlign Align = -- cast(Op.getOperand(2))->getMaybeAlignValue(); -- EVT VT = Node->getValueType(0); -- -- if (DAG.getMachineFunction().getFunction().hasFnAttribute( -- "no-stack-arg-probe")) { -- SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); -- Chain = SP.getValue(1); -- SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); -- if (Align) -- SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), -- DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); -- Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); -- SDValue Ops[2] = {SP, Chain}; -- return DAG.getMergeValues(Ops, dl); -- } -- -- Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); -- -- Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); -- -- SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); -- Chain = SP.getValue(1); -- SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); -- if (Align) -- SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), -- DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); -- Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); -- -- Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); -- -- SDValue Ops[2] = {SP, Chain}; -- return DAG.getMergeValues(Ops, dl); --} -- - SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); -diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h -index 643d363e234a..9b388c7f8668 100644 ---- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h -+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h -@@ -90,6 +90,10 @@ enum NodeType : unsigned { - ADC, - SBC, // adc, sbc instructions - -+ // To avoid stack clash, allocation is performed by block and each block is -+ // probed. -+ PROBED_ALLOCA, -+ - // Predicated instructions where inactive lanes produce undefined results. 
- ABDS_PRED, - ABDU_PRED, -@@ -610,6 +614,9 @@ public: - MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, - MachineBasicBlock *BB) const; - -+ MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI, -+ MachineBasicBlock *MBB) const; -+ - MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, - MachineInstr &MI, - MachineBasicBlock *BB) const; -@@ -1113,10 +1120,10 @@ private: - SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; -+ SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; -+ SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; -- SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, -- SDValue &Size, -- SelectionDAG &DAG) const; -+ - SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const; - - SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op, -diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td -index 09980c2f45e6..9b9103e01d67 100644 ---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td -+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td -@@ -818,6 +818,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain, - def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; - - def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; -+ -+def AArch64probedalloca -+ : SDNode<"AArch64ISD::PROBED_ALLOCA", -+ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, -+ [SDNPHasChain, SDNPMayStore]>; -+ - def AArch64mrs : SDNode<"AArch64ISD::MRS", - SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>, - [SDNPHasChain, SDNPOutGlue]>; -@@ -908,6 +914,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs), - []>, - Sched<[]>; - -+// Probed stack allocations of a variable size, used for allocas of unknown size -+// when stack-clash protection is enabled. -+let usesCustomInserter = 1 in -+def PROBED_STACKALLOC_DYN : Pseudo<(outs), -+ (ins GPR64common:$target), -+ [(AArch64probedalloca GPR64common:$target)]>, -+ Sched<[]>; -+ - } // Defs = [SP, NZCV], Uses = [SP] in - } // hasSideEffects = 1, isCodeGenOnly = 1 - -diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll -new file mode 100644 -index 000000000000..673f9038a35f ---- /dev/null -+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll -@@ -0,0 +1,14 @@ -+; RUN: llc --stop-after=finalize-isel -o - | FileCheck %s -+target triple = "aarch64-linux" -+ -+; Check dynamic stack allocation and probing instructions do not have -+; the FrameSetup flag. 
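-+; At --stop-after=finalize-isel the output is MIR, where the printer writes
-+; the flag as a `frame-setup` prefix on each instruction carrying it; a line
-+; such as "$sp = frame-setup SUBXri $sp, 16, 0" (illustrative only, not
-+; expected output here) is what the CHECK-NOT below would reject.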
-+ -+; CHECK-NOT: frame-setup -+define void @no_frame_setup(i64 %size, ptr %out) #0 { -+ %v = alloca i8, i64 %size, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } -\ No newline at end of file -diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll -new file mode 100644 -index 000000000000..4d9ef77f7a0d ---- /dev/null -+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll -@@ -0,0 +1,362 @@ -+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s -+ -+; Dynamically-sized allocation, needs a loop which can handle any size at -+; runtime. The final iteration of the loop will temporarily put SP below the -+; target address, but this doesn't break any of the ABI constraints on the -+; stack, and also doesn't probe below the target SP value. -+define void @dynamic(i64 %size, ptr %out) #0 { -+; CHECK-LABEL: dynamic: -+; CHECK: // %bb.0: -+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: add x9, x0, #15 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 -+; CHECK-NEXT: sub x8, x8, x9 -+; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x8 -+; CHECK-NEXT: b.le .LBB0_3 -+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB0_1 Depth=1 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: b .LBB0_1 -+; CHECK-NEXT: .LBB0_3: -+; CHECK-NEXT: mov sp, x8 -+; CHECK-NEXT: str xzr, [sp] -+; CHECK-NEXT: str x8, [x1] -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+ %v = alloca i8, i64 %size, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; This function has a fixed-size stack slot and a dynamic one. The fixed size -+; slot isn't large enough that we would normally probe it, but we need to do so -+; here otherwise the gap between the CSR save and the first probe of the -+; dynamic allocation could be too far apart when the size of the dynamic -+; allocation is close to the guard size. -+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 { -+; CHECK-LABEL: dynamic_fixed: -+; CHECK: // %bb.0: -+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: str xzr, [sp, #-64]! 
-+; CHECK-NEXT: add x9, x0, #15
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: sub x10, x29, #64
-+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
-+; CHECK-NEXT: str x10, [x1]
-+; CHECK-NEXT: sub x8, x8, x9
-+; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: cmp sp, x8
-+; CHECK-NEXT: b.le .LBB1_3
-+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB1_1 Depth=1
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: b .LBB1_1
-+; CHECK-NEXT: .LBB1_3:
-+; CHECK-NEXT: mov sp, x8
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: str x8, [x2]
-+; CHECK-NEXT: mov sp, x29
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w30
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+ %v1 = alloca i8, i64 64, align 1
-+ store ptr %v1, ptr %out1, align 8
-+ %v2 = alloca i8, i64 %size, align 1
-+ store ptr %v2, ptr %out2, align 8
-+ ret void
-+}
-+
-+; Dynamic allocation, with an alignment requirement greater than the alignment
-+; of SP. Done by ANDing the target SP with a constant to align it down, then
-+; doing the loop as normal. Note that we also re-align the stack in the prolog,
-+; which isn't actually needed because the only aligned allocations are dynamic,
-+; this is done even without stack probing.
-+define void @dynamic_align_64(i64 %size, ptr %out) #0 {
-+; CHECK-LABEL: dynamic_align_64:
-+; CHECK: // %bb.0:
-+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 32
-+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
-+; CHECK-NEXT: mov x29, sp
-+; CHECK-NEXT: .cfi_def_cfa w29, 32
-+; CHECK-NEXT: .cfi_offset w19, -16
-+; CHECK-NEXT: .cfi_offset w30, -24
-+; CHECK-NEXT: .cfi_offset w29, -32
-+; CHECK-NEXT: sub x9, sp, #32
-+; CHECK-NEXT: and sp, x9, #0xffffffffffffffc0
-+; CHECK-NEXT: add x9, x0, #15
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
-+; CHECK-NEXT: mov x19, sp
-+; CHECK-NEXT: sub x8, x8, x9
-+; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0
-+; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: cmp sp, x8
-+; CHECK-NEXT: b.le .LBB2_3
-+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: b .LBB2_1
-+; CHECK-NEXT: .LBB2_3:
-+; CHECK-NEXT: mov sp, x8
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: str x8, [x1]
-+; CHECK-NEXT: mov sp, x29
-+; CHECK-NEXT: .cfi_def_cfa wsp, 32
-+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
-+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w19
-+; CHECK-NEXT: .cfi_restore w30
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+ %v = alloca i8, i64 %size, align 64
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; Dynamic allocation, with an alignment greater than the stack guard size. The
-+; only difference to the dynamic allocation is the constant used for aligning
-+; the target SP, the loop will probe the whole allocation without needing to
-+; know about the alignment padding.
-+define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
-+; CHECK-LABEL: dynamic_align_8192:
-+; CHECK: // %bb.0:
-+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 32
-+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
-+; CHECK-NEXT: mov x29, sp
-+; CHECK-NEXT: .cfi_def_cfa w29, 32
-+; CHECK-NEXT: .cfi_offset w19, -16
-+; CHECK-NEXT: .cfi_offset w30, -24
-+; CHECK-NEXT: .cfi_offset w29, -32
-+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: sub x9, x9, #4064
-+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000
-+; CHECK-NEXT: .LBB3_1: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.le .LBB3_3
-+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB3_1 Depth=1
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: b .LBB3_1
-+; CHECK-NEXT: .LBB3_3:
-+; CHECK-NEXT: mov sp, x9
-+; CHECK-NEXT: add x9, x0, #15
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
-+; CHECK-NEXT: mov x19, sp
-+; CHECK-NEXT: sub x8, x8, x9
-+; CHECK-NEXT: and x8, x8, #0xffffffffffffe000
-+; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: cmp sp, x8
-+; CHECK-NEXT: b.le .LBB3_6
-+; CHECK-NEXT: // %bb.5: // in Loop: Header=BB3_4 Depth=1
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: b .LBB3_4
-+; CHECK-NEXT: .LBB3_6:
-+; CHECK-NEXT: mov sp, x8
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: str x8, [x1]
-+; CHECK-NEXT: mov sp, x29
-+; CHECK-NEXT: .cfi_def_cfa wsp, 32
-+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
-+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w19
-+; CHECK-NEXT: .cfi_restore w30
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+ %v = alloca i8, i64 %size, align 8192
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; For 64k guard pages, the only difference is the constant subtracted from SP
-+; in the loop.
-+define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" {
-+; CHECK-LABEL: dynamic_64k_guard:
-+; CHECK: // %bb.0:
-+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: mov x29, sp
-+; CHECK-NEXT: .cfi_def_cfa w29, 16
-+; CHECK-NEXT: .cfi_offset w30, -8
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: add x9, x0, #15
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
-+; CHECK-NEXT: sub x8, x8, x9
-+; CHECK-NEXT: .LBB4_1: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-+; CHECK-NEXT: cmp sp, x8
-+; CHECK-NEXT: b.le .LBB4_3
-+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB4_1 Depth=1
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: b .LBB4_1
-+; CHECK-NEXT: .LBB4_3:
-+; CHECK-NEXT: mov sp, x8
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: str x8, [x1]
-+; CHECK-NEXT: mov sp, x29
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w30
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+ %v = alloca i8, i64 %size, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; If a function has variable-sized stack objects, then any function calls which
-+; need to pass arguments on the stack must allocate the stack space for them
-+; dynamically, to ensure they are at the bottom of the frame. We need to probe
-+; that space when it is larger than the unprobed space allowed by the ABI (1024
-+; bytes), so this needs a very large number of arguments.
-+define void @no_reserved_call_frame(i64 %n) #0 {
-+; CHECK-LABEL: no_reserved_call_frame:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: mov x29, sp
-+; CHECK-NEXT: .cfi_def_cfa w29, 16
-+; CHECK-NEXT: .cfi_offset w30, -8
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: lsl x9, x0, #2
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: add x9, x9, #15
-+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
-+; CHECK-NEXT: sub x0, x8, x9
-+; CHECK-NEXT: .LBB5_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: cmp sp, x0
-+; CHECK-NEXT: b.le .LBB5_3
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: // in Loop: Header=BB5_1 Depth=1
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: b .LBB5_1
-+; CHECK-NEXT: .LBB5_3: // %entry
-+; CHECK-NEXT: mov sp, x0
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: sub sp, sp, #1104
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: bl callee_stack_args
-+; CHECK-NEXT: add sp, sp, #1104
-+; CHECK-NEXT: mov sp, x29
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w30
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i32, i64 %n
-+ call void @callee_stack_args(ptr %v, [138 x i64] undef)
-+ ret void
-+}
-+
-+; Same as above but without a variable-sized allocation, so the reserved call
-+; frame can be folded into the fixed-size allocation in the prologue.
-+define void @reserved_call_frame(i64 %n) #0 {
-+; CHECK-LABEL: reserved_call_frame:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 32
-+; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill
-+; CHECK-NEXT: mov x29, sp
-+; CHECK-NEXT: .cfi_def_cfa w29, 32
-+; CHECK-NEXT: .cfi_offset w28, -16
-+; CHECK-NEXT: .cfi_offset w30, -24
-+; CHECK-NEXT: .cfi_offset w29, -32
-+; CHECK-NEXT: sub sp, sp, #1504
-+; CHECK-NEXT: add x0, sp, #1104
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: bl callee_stack_args
-+; CHECK-NEXT: add sp, sp, #1504
-+; CHECK-NEXT: .cfi_def_cfa wsp, 32
-+; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload
-+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w28
-+; CHECK-NEXT: .cfi_restore w30
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i32, i64 100
-+ call void @callee_stack_args(ptr %v, [138 x i64] undef)
-+ ret void
-+}
-+
-+declare void @callee_stack_args(ptr, [138 x i64])
-+
-+; Dynamic allocation of SVE vectors
-+define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" {
-+; CHECK-LABEL: dynamic_sve:
-+; CHECK: // %bb.0:
-+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 32
-+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
-+; CHECK-NEXT: mov x29, sp
-+; CHECK-NEXT: .cfi_def_cfa w29, 32
-+; CHECK-NEXT: .cfi_offset w19, -16
-+; CHECK-NEXT: .cfi_offset w30, -24
-+; CHECK-NEXT: .cfi_offset w29, -32
-+; CHECK-NEXT: rdvl x9, #1
-+; CHECK-NEXT: mov x10, #15 // =0xf
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: madd x9, x0, x9, x10
-+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
-+; CHECK-NEXT: sub x8, x8, x9
-+; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: cmp sp, x8
-+; CHECK-NEXT: b.le .LBB7_3
-+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: b .LBB7_1
-+; CHECK-NEXT: .LBB7_3:
-+; CHECK-NEXT: mov sp, x8
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: str x8, [x1]
-+; CHECK-NEXT: mov sp, x29
-+; CHECK-NEXT: .cfi_def_cfa wsp, 32
-+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
-+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w19
-+; CHECK-NEXT: .cfi_restore w30
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+ %v = alloca <vscale x 16 x i8>, i64 %size, align 16
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
-\ No newline at end of file
---
-2.42.0.windows.2
-
diff --git a/0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch b/0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch
deleted file mode 100644
index 42d7350..0000000
--- a/0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch
+++ /dev/null
@@ -1,496 +0,0 @@
-From dbca022577e0da1f411ee84143d59c6c9d941969 Mon Sep 17 00:00:00 2001
-From: rickyleung
-Date: Fri, 26 Apr 2024 17:29:18 +0800
-Subject: [PATCH 6/7] [backport][AArch64] Stack probing for dynamic allocas in
- GlobalISel
-
-Reference: https://github.com/llvm/llvm-project/commit/c1140d49ec3363bf903e4c1dbf7a3f5e8c1b6523
-
-Co-authored-by: Oliver Stannard
----
- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 2 +
- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 37 ++-
- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 47 +++-
- .../AArch64/GISel/AArch64LegalizerInfo.h | 1 +
- .../GlobalISel/legalize-dyn-alloca.mir | 255 ++++++++++++++----
- .../GlobalISel/legalizer-info-validation.mir | 7 +
- .../CodeGen/AArch64/stack-probing-dynamic.ll | 3 +-
- 7 files changed, 284 insertions(+), 68 deletions(-)
-
-diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
-index 9288091874cf..7abbd1f03f16 100644
---- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
-+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
-@@ -400,6 +400,8 @@ public:
- LegalizeResult lowerUnmergeValues(MachineInstr &MI);
- LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI);
- LegalizeResult lowerShuffleVector(MachineInstr &MI);
-+ Register getDynStackAllocTargetPtr(Register SPReg, Register AllocSize,
-+ Align Alignment, LLT PtrTy);
- LegalizeResult lowerDynStackAlloc(MachineInstr &MI);
- LegalizeResult lowerStackSave(MachineInstr &MI);
- LegalizeResult lowerStackRestore(MachineInstr &MI);
-diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
-index 75d9789be4d0..5557456e706d 100644
---- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
-+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
-@@ -6777,21 +6777,12 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
- return Legalized;
- }
-
--LegalizerHelper::LegalizeResult
--LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
-- const auto &MF = *MI.getMF();
-- const auto &TFI = *MF.getSubtarget().getFrameLowering();
-- if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
-- return UnableToLegalize;
--
-- Register Dst = MI.getOperand(0).getReg();
-- Register AllocSize = MI.getOperand(1).getReg();
-- Align Alignment = assumeAligned(MI.getOperand(2).getImm());
--
-- LLT PtrTy = MRI.getType(Dst);
-+Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
-+ Register AllocSize,
-+ Align Alignment,
-+ LLT PtrTy) {
- LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
-
-- Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
- auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
- SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
-
-@@ -6806,7 +6797,25 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
- Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
- }
-
-- SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
-+ return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0);
-+}
-+
-+LegalizerHelper::LegalizeResult
-+LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
-+ const auto &MF = *MI.getMF();
-+ const auto &TFI = *MF.getSubtarget().getFrameLowering();
-+ if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
-+ return UnableToLegalize;
-+
-+ Register Dst = MI.getOperand(0).getReg();
-+ Register AllocSize = MI.getOperand(1).getReg();
-+ Align Alignment = assumeAligned(MI.getOperand(2).getImm());
-+
-+ LLT PtrTy = MRI.getType(Dst);
-+ Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
-+ Register SPTmp =
-+ getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
-+
- MIRBuilder.buildCopy(SPReg, SPTmp);
- MIRBuilder.buildCopy(Dst, SPTmp);
-
-diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
-index f0130a0be29d..0dd2b4d48dd6 100644
---- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
-+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
-@@ -797,9 +797,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
- return Query.Types[0] == p0 && Query.Types[1] == s64;
- });
-
-- getActionDefinitionsBuilder({G_DYN_STACKALLOC,
-- G_STACKSAVE,
-- G_STACKRESTORE}).lower();
-+ getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
-+
-+ getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
-
- if (ST.hasMOPS()) {
- // G_BZERO is not supported. Currently it is only emitted by
- ---
- name: test_simple_alloca
-@@ -37,22 +52,23 @@ body: |
-
- ; CHECK-LABEL: name: test_simple_alloca
- ; CHECK: liveins: $w0
-- ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-- ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-- ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-- ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-- ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-- ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-- ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-- ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-- ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-- ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-- ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
-- ; CHECK: $sp = COPY [[INTTOPTR]](p0)
-- ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-- ; CHECK: $x0 = COPY [[COPY2]](p0)
-- ; CHECK: RET_ReallyLR implicit $x0
-+ ; CHECK-NEXT: {{ $}}
-+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
-+ ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0)
-+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
- %0:_(s32) = COPY $w0
- %3:_(s64) = G_CONSTANT i64 1
- %1:_(s64) = G_ZEXT %0(s32)
-@@ -83,24 +99,25 @@ body: |
-
- ; CHECK-LABEL: name: test_aligned_alloca
- ; CHECK: liveins: $w0
-- ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-- ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-- ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-- ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-- ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-- ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-- ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-- ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-- ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-- ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-- ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32
-- ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
-- ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64)
-- ; CHECK: $sp = COPY [[INTTOPTR]](p0)
-- ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-- ; CHECK: $x0 = COPY [[COPY2]](p0)
-- ; CHECK: RET_ReallyLR implicit $x0
-+ ; CHECK-NEXT: {{ $}}
-+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32
-+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
-+ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64)
-+ ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0)
-+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
- %0:_(s32) = COPY $w0
- %3:_(s64) = G_CONSTANT i64 1
- %1:_(s64) = G_ZEXT %0(s32)
-@@ -131,22 +148,23 @@ body: |
-
- ; CHECK-LABEL: name: test_natural_alloca
- ; CHECK: liveins: $w0
-- ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-- ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-- ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-- ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-- ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-- ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-- ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-- ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-- ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-- ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-- ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
-- ; CHECK: $sp = COPY [[INTTOPTR]](p0)
-- ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-- ; CHECK: $x0 = COPY [[COPY2]](p0)
-- ; CHECK: RET_ReallyLR implicit $x0
-+ ; CHECK-NEXT: {{ $}}
-+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
-+ ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0)
-+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
- %0:_(s32) = COPY $w0
- %3:_(s64) = G_CONSTANT i64 16
- %1:_(s64) = G_ZEXT %0(s32)
-@@ -160,3 +178,140 @@ body: |
- RET_ReallyLR implicit $x0
-
- ...
-+---
-+name: test_simple_alloca_stack_probing
-+alignment: 4
-+tracksRegLiveness: true
-+liveins:
-+ - { reg: '$w0' }
-+frameInfo:
-+ maxAlignment: 1
-+stack:
-+ - { id: 0, name: addr, type: variable-sized, alignment: 1 }
-+machineFunctionInfo: {}
-+body: |
-+ bb.1 (%ir-block.0):
-+ liveins: $w0
-+ ; CHECK-LABEL: name: test_simple_alloca_stack_probing
-+ ; CHECK: liveins: $w0
-+ ; CHECK-NEXT: {{ $}}
-+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
-+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64)
-+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]]
-+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[SUB]](s64)
-+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+ ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp
-+ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
-+ %0:_(s32) = COPY $w0
-+ %1:_(s64) = G_ZEXT %0(s32)
-+ %9:_(s64) = G_CONSTANT i64 0
-+ %2:_(s64) = G_SHL %1, %9(s64)
-+ %4:_(s64) = G_CONSTANT i64 15
-+ %5:_(s64) = nuw G_ADD %2, %4
-+ %6:_(s64) = G_CONSTANT i64 -16
-+ %7:_(s64) = G_AND %5, %6
-+ %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1
-+ $x0 = COPY %8(p0)
-+ RET_ReallyLR implicit $x0
-+...
-+---
-+name: test_aligned_alloca_stack_probing
-+alignment: 4
-+tracksRegLiveness: true
-+liveins:
-+ - { reg: '$w0' }
-+frameInfo:
-+ maxAlignment: 32
-+stack:
-+ - { id: 0, name: addr, type: variable-sized, alignment: 32 }
-+machineFunctionInfo: {}
-+body: |
-+ bb.1 (%ir-block.0):
-+ liveins: $w0
-+ ; CHECK-LABEL: name: test_aligned_alloca_stack_probing
-+ ; CHECK: liveins: $w0
-+ ; CHECK-NEXT: {{ $}}
-+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
-+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64)
-+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]]
-+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32
-+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
-+ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[AND1]](s64)
-+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+ ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp
-+ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
-+ %0:_(s32) = COPY $w0
-+ %1:_(s64) = G_ZEXT %0(s32)
-+ %9:_(s64) = G_CONSTANT i64 0
-+ %2:_(s64) = G_SHL %1, %9(s64)
-+ %4:_(s64) = G_CONSTANT i64 15
-+ %5:_(s64) = nuw G_ADD %2, %4
-+ %6:_(s64) = G_CONSTANT i64 -16
-+ %7:_(s64) = G_AND %5, %6
-+ %8:_(p0) = G_DYN_STACKALLOC %7(s64), 32
-+ $x0 = COPY %8(p0)
-+ RET_ReallyLR implicit $x0
-+...
-+---
-+name: test_natural_alloca_stack_probing
-+alignment: 4
-+tracksRegLiveness: true
-+liveins:
-+ - { reg: '$w0' }
-+frameInfo:
-+ maxAlignment: 1
-+stack:
-+ - { id: 0, name: addr, type: variable-sized, alignment: 1 }
-+machineFunctionInfo: {}
-+body: |
-+ bb.1 (%ir-block.0):
-+ liveins: $w0
-+ ; CHECK-LABEL: name: test_natural_alloca_stack_probing
-+ ; CHECK: liveins: $w0
-+ ; CHECK-NEXT: {{ $}}
-+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64)
-+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]]
-+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[SUB]](s64)
-+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+ ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp
-+ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
-+ %0:_(s32) = COPY $w0
-+ %1:_(s64) = G_ZEXT %0(s32)
-+ %9:_(s64) = G_CONSTANT i64 4
-+ %2:_(s64) = G_SHL %1, %9(s64)
-+ %4:_(s64) = G_CONSTANT i64 15
-+ %5:_(s64) = nuw G_ADD %2, %4
-+ %6:_(s64) = G_CONSTANT i64 -16
-+ %7:_(s64) = G_AND %5, %6
-+ %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1
-+ $x0 = COPY %8(p0)
-+ RET_ReallyLR implicit $x0
-+...
-\ No newline at end of file
-diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
-index 461161f5b338..efae9b66b53d 100644
---- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
-+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
-@@ -652,6 +652,13 @@
- # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]]
- # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
- # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: G_STACKSAVE (opcode [[STACKSAVE:[0-9]+]]): 1 type index, 0 imm indices
-+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: G_STACKRESTORE (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[STACKSAVE]]
-+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
- # DEBUG-NEXT: G_STRICT_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
- # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
- # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
-diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-index 4d9ef77f7a0d..ad9cdbe92b23 100644
---- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-@@ -1,5 +1,6 @@
- ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
--; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
-+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
-+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
-
- ; Dynamically-sized allocation, needs a loop which can handle any size at
- ; runtime. The final iteration of the loop will temporarily put SP below the
---
-2.42.0.windows.2
-
diff --git a/0026-Update-testcase-for-stack-clash-protection-backport.patch b/0026-Update-testcase-for-stack-clash-protection-backport.patch
deleted file mode 100644
index 4b36a24..0000000
--- a/0026-Update-testcase-for-stack-clash-protection-backport.patch
+++ /dev/null
@@ -1,177 +0,0 @@
-From 9425ee5f8608ff8611628d83386f61950d7fff85 Mon Sep 17 00:00:00 2001
-From: rickyleung
-Date: Tue, 7 May 2024 21:37:03 +0800
-Subject: [PATCH 7/7] Update testcase for stack clash protection backport
-
----
- .../GlobalISel/legalize-dyn-alloca.mir | 3 +-
- .../GlobalISel/stacksave-stackrestore.ll | 14 ++++++----
- .../CodeGen/AArch64/stack-probing-dynamic.ll | 16 ++++++-----
- .../AArch64/stack-probing-last-in-block.mir | 4 +--
- .../X86/GlobalISel/stacksave-stackrestore.ll | 28 +++++++++++--------
- 5 files changed, 36 insertions(+), 29 deletions(-)
-
-diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
-index 882c7468e70f..82781cebc55a 100644
---- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
-+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
-@@ -313,5 +313,4 @@ body: |
- %7:_(s64) = G_AND %5, %6
- %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1
- $x0 = COPY %8(p0)
-- RET_ReallyLR implicit $x0
--...
-\ No newline at end of file
-+ RET_ReallyLR implicit $x0
-\ No newline at end of file
-diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
-index 16bf85af9c17..97ecca0bd77b 100644
---- a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
-+++ b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
-@@ -15,14 +15,18 @@ define void @test_scoped_alloca(i64 %n) {
- ; CHECK-NEXT: .cfi_offset w19, -16
- ; CHECK-NEXT: .cfi_offset w30, -24
- ; CHECK-NEXT: .cfi_offset w29, -32
--; CHECK-NEXT: add x9, x0, #15
-+; CHECK-NEXT: mov x19, x0
-+; CHECK-NEXT: bl llvm.stacksave.p0
-+; CHECK-NEXT: add x9, x19, #15
- ; CHECK-NEXT: mov x8, sp
- ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
--; CHECK-NEXT: mov x19, sp
--; CHECK-NEXT: sub x0, x8, x9
--; CHECK-NEXT: mov sp, x0
-+; CHECK-NEXT: mov x19, x0
-+; CHECK-NEXT: sub x8, x8, x9
-+; CHECK-NEXT: mov sp, x8
-+; CHECK-NEXT: mov x0, x8
- ; CHECK-NEXT: bl use_addr
--; CHECK-NEXT: mov sp, x19
-+; CHECK-NEXT: mov x0, x19
-+; CHECK-NEXT: bl llvm.stackrestore.p0
- ; CHECK-NEXT: mov sp, x29
- ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
- ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-index ad9cdbe92b23..3cbcf7749b2a 100644
---- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-@@ -59,10 +59,10 @@ define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
- ; CHECK-NEXT: str xzr, [sp, #-64]!
- ; CHECK-NEXT: add x9, x0, #15
- ; CHECK-NEXT: mov x8, sp
--; CHECK-NEXT: sub x10, x29, #64
- ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
--; CHECK-NEXT: str x10, [x1]
-+; CHECK-NEXT: sub x10, x29, #64
- ; CHECK-NEXT: sub x8, x8, x9
-+; CHECK-NEXT: str x10, [x1]
- ; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1
- ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
- ; CHECK-NEXT: cmp sp, x8
-@@ -108,10 +108,10 @@ define void @dynamic_align_64(i64 %size, ptr %out) #0 {
- ; CHECK-NEXT: and sp, x9, #0xffffffffffffffc0
- ; CHECK-NEXT: add x9, x0, #15
- ; CHECK-NEXT: mov x8, sp
--; CHECK-NEXT: str xzr, [sp]
- ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
--; CHECK-NEXT: mov x19, sp
-+; CHECK-NEXT: str xzr, [sp]
- ; CHECK-NEXT: sub x8, x8, x9
-+; CHECK-NEXT: mov x19, sp
- ; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0
- ; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
- ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-@@ -167,10 +167,10 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
- ; CHECK-NEXT: mov sp, x9
- ; CHECK-NEXT: add x9, x0, #15
- ; CHECK-NEXT: mov x8, sp
--; CHECK-NEXT: str xzr, [sp]
- ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
--; CHECK-NEXT: mov x19, sp
-+; CHECK-NEXT: str xzr, [sp]
- ; CHECK-NEXT: sub x8, x8, x9
-+; CHECK-NEXT: mov x19, sp
- ; CHECK-NEXT: and x8, x8, #0xffffffffffffe000
- ; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1
- ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-@@ -268,8 +268,10 @@ define void @no_reserved_call_frame(i64 %n) #0 {
- ; CHECK-NEXT: str xzr, [sp]
- ; CHECK-NEXT: sub sp, sp, #1104
- ; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: sub sp, sp, #1104
- ; CHECK-NEXT: bl callee_stack_args
- ; CHECK-NEXT: add sp, sp, #1104
-+; CHECK-NEXT: add sp, sp, #1104
- ; CHECK-NEXT: mov sp, x29
- ; CHECK-NEXT: .cfi_def_cfa wsp, 16
- ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-@@ -331,8 +333,8 @@ define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" {
- ; CHECK-NEXT: .cfi_offset w29, -32
- ; CHECK-NEXT: rdvl x9, #1
- ; CHECK-NEXT: mov x10, #15 // =0xf
--; CHECK-NEXT: mov x8, sp
- ; CHECK-NEXT: madd x9, x0, x9, x10
-+; CHECK-NEXT: mov x8, sp
- ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
- ; CHECK-NEXT: sub x8, x8, x9
- ; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
-diff --git a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
-index a8a21ab330ba..9a173be5857e 100644
---- a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
-+++ b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
-@@ -141,6 +141,4 @@ body: |
- B %bb.2
-
- bb.2.exit:
-- RET_ReallyLR
--
--...
-\ No newline at end of file
-+ RET_ReallyLR
-\ No newline at end of file
-diff --git a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
-index e86c04ee22db..8f665924577f 100644
---- a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
-+++ b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
-@@ -13,21 +13,25 @@ define void @test_scoped_alloca(i64 %n) {
- ; CHECK-NEXT: .cfi_offset %rbp, -16
- ; CHECK-NEXT: movq %rsp, %rbp
- ; CHECK-NEXT: .cfi_def_cfa_register %rbp
-+; CHECK-NEXT: pushq %r14
- ; CHECK-NEXT: pushq %rbx
--; CHECK-NEXT: pushq %rax
--; CHECK-NEXT: .cfi_offset %rbx, -24
--; CHECK-NEXT: movq %rsp, %rbx
--; CHECK-NEXT: movq %rsp, %rax
--; CHECK-NEXT: imulq $1, %rdi, %rcx
--; CHECK-NEXT: addq $15, %rcx
--; CHECK-NEXT: andq $-16, %rcx
--; CHECK-NEXT: subq %rcx, %rax
--; CHECK-NEXT: movq %rax, %rsp
--; CHECK-NEXT: movq %rax, %rdi
-+; CHECK-NEXT: .cfi_offset %rbx, -32
-+; CHECK-NEXT: .cfi_offset %r14, -24
-+; CHECK-NEXT: movq %rdi, %rbx
-+; CHECK-NEXT: callq llvm.stacksave.p0
-+; CHECK-NEXT: movq %rax, %r14
-+; CHECK-NEXT: movq %rsp, %rdi
-+; CHECK-NEXT: imulq $1, %rbx, %rax
-+; CHECK-NEXT: addq $15, %rax
-+; CHECK-NEXT: andq $-16, %rax
-+; CHECK-NEXT: subq %rax, %rdi
-+; CHECK-NEXT: movq %rdi, %rsp
- ; CHECK-NEXT: callq use_addr
--; CHECK-NEXT: movq %rbx, %rsp
--; CHECK-NEXT: leaq -8(%rbp), %rsp
-+; CHECK-NEXT: movq %r14, %rdi
-+; CHECK-NEXT: callq llvm.stackrestore.p0
-+; CHECK-NEXT: leaq -16(%rbp), %rsp
- ; CHECK-NEXT: popq %rbx
-+; CHECK-NEXT: popq %r14
- ; CHECK-NEXT: popq %rbp
- ; CHECK-NEXT: .cfi_def_cfa %rsp, 8
- ; CHECK-NEXT: retq
---
-2.42.0.windows.2
-
diff --git a/llvm.spec b/llvm.spec
index 6a215a4..73dfde2 100644
--- a/llvm.spec
+++ b/llvm.spec
@@ -38,7 +38,7 @@
 
 Name: %{pkg_name}
 Version: %{maj_ver}.%{min_ver}.%{patch_ver}
-Release: 11
+Release: 10
 Summary: The Low Level Virtual Machine
 
 License: NCSA
@@ -70,13 +70,6 @@
 Patch18: 0018-Fix-declaration-definition-mismatch-for-classic-flang.patch
 Patch19: 0019-Backport-LoongArch-Improve-the-support-for-atomic-and-clear_cache.patch
 Patch20: 0020-Update-llvm-lit-config-to-support-build_for_openeule.patch
-Patch21: 0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch
-Patch22: 0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch
-Patch23: 0023-Backport-AArch64-Stack-probing-for-function-prologues.patch
-Patch24: 0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch
-Patch25: 0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch
-Patch26: 0026-Update-testcase-for-stack-clash-protection-backport.patch
-
 BuildRequires: binutils-devel
 BuildRequires: cmake
 BuildRequires: gcc
@@ -360,9 +353,6 @@ LD_LIBRARY_PATH=%{buildroot}/%{install_libdir} %{__ninja} check-all -C ./_build
 %{install_includedir}/llvm-gmock
 
 %changelog
-* Fri May 10 2024 rickyleung - 17.0.6-11
-- Backport the patches to support stack clash protection
-
 * Mon Apr 29 2024 wangqiang - 17.0.6-10
 - Update llvm-lit config to support macro `build_for_openeuler`

--
Gitee
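
Note: the tests deleted by this revert drive stack-clash probing entirely through LLVM IR function attributes. A minimal sketch in the style of those tests is shown below (the function name is illustrative, not from the deleted files): "probe-stack"="inline-asm" requests the inline probing loop, and the optional "stack-probe-size" attribute overrides the 4096-byte guard size the probing loop assumes by default, as in the deleted dynamic_64k_guard test.

    ; Compile with: llc -mtriple aarch64-none-eabi example.ll -o -
    define void @probed_dynamic(i64 %size, ptr %out)
        "probe-stack"="inline-asm" "stack-probe-size"="65536" {
      ; With the attributes above, the (reverted) backports lower this
      ; dynamically-sized alloca to a loop that subtracts one guard-size
      ; step from SP per iteration and stores xzr to probe each page.
      %v = alloca i8, i64 %size, align 1
      store ptr %v, ptr %out, align 8
      ret void
    }

After this revert, the packaged LLVM 17.0.6 no longer carries patches 0021-0026, so these attributes fall back to upstream 17.x behavior on AArch64.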