diff options
author | David Green <david.green@arm.com> | 2023-11-18 09:55:19 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-11-18 09:55:19 +0000 |
commit | 303a7835ff833278a0de20cf5a70085b2ae8fee1 (patch) | |
tree | c6a58b16ea7e57df2870f7de5cc9a15eb2b64c72 | |
parent | 56d0e8ccf424ddcd74a505837b8966204aaba415 (diff) |
[GreedyRA] Improve RA for nested loop induction variables (#72093)
Imagine a loop of the form:
```
preheader:
%r = def
header:
bcc latch, inner
inner1:
..
inner2:
b latch
latch:
%r = subs %r
bcc header
```
It can be possible for code to spend a decent amount of time in the
header<->latch loop, not going into the inner part of the loop as much.
The greedy register allocator can prefer to spill _around_ %r though,
adding spills around the subs in the loop, which can be very detrimental
for performance. (The case I am looking at is actually a very deeply
nested set of loops that repeat the header<->latch pattern at multiple
different levels).
The greedy RA will apply a preference to spill to the IV, as it is live
through the header block. This patch attempts to add a heuristic to
prevent that in this case for variables that look like IVs, similar to
the extra spill weight that gets added to variables that look like IVs,
which are expensive to spill. That will mean spills are more
likely to be pushed into the inner blocks, where they are less likely to
be executed and not as expensive as spills around the IV.
This gives an 8% speedup in the exchange benchmark from spec2017 when
compiled with flang-new, whilst importantly stabilising the scores to be
less chaotic to other changes. Running ctmark showed no difference in
the compile time. I've tried to run a range of benchmarking for
performance, most of which were relatively flat, not showing many large
differences. One matrix multiply case improved 21.3% due to removing a
cascading chain of spills, and some other knock-on effects happen which
usually cause small differences in the scores.
-rw-r--r-- | llvm/lib/CodeGen/RegAllocGreedy.cpp | 26 | ||||
-rw-r--r-- | llvm/lib/CodeGen/SplitKit.cpp | 12 | ||||
-rw-r--r-- | llvm/lib/CodeGen/SplitKit.h | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/nested-iv-regalloc.mir | 41 | ||||
-rw-r--r-- | llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding-tieddef.mir | 2 |
5 files changed, 62 insertions, 25 deletions
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 2ad534c985b0..a208bf89fadf 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -767,10 +767,28 @@ bool RAGreedy::growRegion(GlobalSplitCandidate &Cand) { if (Cand.PhysReg) { if (!addThroughConstraints(Cand.Intf, NewBlocks)) return false; - } else - // Provide a strong negative bias on through blocks to prevent unwanted - // liveness on loop backedges. - SpillPlacer->addPrefSpill(NewBlocks, /* Strong= */ true); + } else { + // Providing that the variable being spilled does not look like a loop + // induction variable, which is expensive to spill around and better + // pushed into a condition inside the loop if possible, provide a strong + // negative bias on through blocks to prevent unwanted liveness on loop + // backedges. + bool PrefSpill = true; + if (SA->looksLikeLoopIV() && NewBlocks.size() >= 2) { + // Check that the current bundle is adding a Header + start+end of + // loop-internal blocks. If the block is indeed a header, don't make + // the NewBlocks as PrefSpill to allow the variable to be live in + // Header<->Latch. + MachineLoop *L = Loops->getLoopFor(MF->getBlockNumbered(NewBlocks[0])); + if (L && L->getHeader()->getNumber() == (int)NewBlocks[0] && + all_of(NewBlocks.drop_front(), [&](unsigned Block) { + return L == Loops->getLoopFor(MF->getBlockNumbered(Block)); + })) + PrefSpill = false; + } + if (PrefSpill) + SpillPlacer->addPrefSpill(NewBlocks, /* Strong= */ true); + } AddedTo = ActiveBlocks.size(); // Perhaps iterating can enable more bundles? 
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index b1c862210932..d6c0a782465e 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -45,6 +45,11 @@ using namespace llvm; #define DEBUG_TYPE "regalloc" +static cl::opt<bool> + EnableLoopIVHeuristic("enable-split-loopiv-heuristic", + cl::desc("Enable loop iv regalloc heuristic"), + cl::init(true)); + STATISTIC(NumFinished, "Number of splits finished"); STATISTIC(NumSimple, "Number of splits that were simple"); STATISTIC(NumCopies, "Number of copies inserted for splitting"); @@ -293,6 +298,13 @@ void SplitAnalysis::calcLiveBlockInfo() { MFI = LIS.getMBBFromIndex(LVI->start)->getIterator(); } + LooksLikeLoopIV = EnableLoopIVHeuristic && UseBlocks.size() == 2 && + any_of(UseBlocks, [this](BlockInfo &BI) { + MachineLoop *L = Loops.getLoopFor(BI.MBB); + return BI.LiveIn && BI.LiveOut && BI.FirstDef && L && + L->isLoopLatch(BI.MBB); + }); + assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count"); } diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h index 1174e392e4e4..cc277ecc0e88 100644 --- a/llvm/lib/CodeGen/SplitKit.h +++ b/llvm/lib/CodeGen/SplitKit.h @@ -159,6 +159,10 @@ private: /// NumThroughBlocks - Number of live-through blocks. unsigned NumThroughBlocks = 0u; + /// LooksLikeLoopIV - The variable defines what looks like it could be a loop + /// IV, where it defs a variable in the latch. + bool LooksLikeLoopIV = false; + // Sumarize statistics by counting instructions using CurLI. void analyzeUses(); @@ -209,6 +213,8 @@ public: return getUseBlocks().size() - NumGapBlocks + getNumThroughBlocks(); } + bool looksLikeLoopIV() const { return LooksLikeLoopIV; } + /// countLiveBlocks - Return the number of blocks where li is live. This is /// guaranteed to return the same number as getNumLiveBlocks() after calling /// analyze(li). 
diff --git a/llvm/test/CodeGen/AArch64/nested-iv-regalloc.mir b/llvm/test/CodeGen/AArch64/nested-iv-regalloc.mir index ac406df6fece..3bd8f83d27c2 100644 --- a/llvm/test/CodeGen/AArch64/nested-iv-regalloc.mir +++ b/llvm/test/CodeGen/AArch64/nested-iv-regalloc.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple aarch64 --run-pass=greedy,virtregrewriter -verify-machineinstrs %s -o - | FileCheck %s +# We should ideally not spill around any of the SUBSWri in the loop exit blocks (if.end and if.end27). + --- | target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" @@ -206,8 +208,7 @@ body: | ; CHECK-NEXT: renamable $w8 = LDRWui renamable $x0, 0 :: (load (s32) from %ir.p) ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x0, 1 :: (load (s32) from %ir.arrayidx2) ; CHECK-NEXT: dead $wzr = SUBSWri renamable $w8, 1, 0, implicit-def $nzcv - ; CHECK-NEXT: renamable $w8 = CSINCWr killed renamable $w8, $wzr, 12, implicit $nzcv - ; CHECK-NEXT: STRWui killed renamable $w8, %stack.2, 0 :: (store (s32) into %stack.2) + ; CHECK-NEXT: renamable $w11 = CSINCWr killed renamable $w8, $wzr, 12, implicit $nzcv ; CHECK-NEXT: renamable $x8 = COPY killed renamable $x10 ; CHECK-NEXT: dead $wzr = SUBSWri renamable $w9, 1, 0, implicit-def $nzcv ; CHECK-NEXT: renamable $w10 = CSINCWr killed renamable $w9, $wzr, 12, implicit $nzcv @@ -215,7 +216,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.do.body: ; CHECK-NEXT: successors: %bb.3(0x50000000), %bb.2(0x30000000) - ; CHECK-NEXT: liveins: $w10, $x2, $x8 + ; CHECK-NEXT: liveins: $w10, $w11, $x2, $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: STRXui renamable $x8, %stack.1, 0 :: (store (s64) into %stack.1) ; CHECK-NEXT: renamable $w9 = MOVi32imm 36, implicit-def $x9 @@ -227,23 +228,24 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.10(0x80000000) - ; CHECK-NEXT: liveins: $w10, $x2 + ; CHECK-NEXT: liveins: 
$w10, $w11, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $x8 = LDRXui %stack.1, 0 :: (load (s64) from %stack.1) ; CHECK-NEXT: B %bb.10 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.do.body12.preheader: ; CHECK-NEXT: successors: %bb.4(0x80000000) - ; CHECK-NEXT: liveins: $w10, $x2 + ; CHECK-NEXT: liveins: $w10, $w11, $x2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $x11 = COPY $xzr + ; CHECK-NEXT: renamable $x12 = COPY $xzr + ; CHECK-NEXT: STRWui renamable $w11, %stack.2, 0 :: (store (s32) into %stack.2) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.do.body12: ; CHECK-NEXT: successors: %bb.5(0x50000000), %bb.8(0x30000000) - ; CHECK-NEXT: liveins: $w10, $x2, $x11 + ; CHECK-NEXT: liveins: $w10, $w11, $x2, $x12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = MOVi32imm 36, implicit-def $x8 - ; CHECK-NEXT: renamable $x8 = MADDXrrr renamable $x11, killed renamable $x8, $xzr + ; CHECK-NEXT: renamable $x8 = MADDXrrr renamable $x12, killed renamable $x8, $xzr ; CHECK-NEXT: renamable $x9 = MOVaddr target-flags(aarch64-page) @g, target-flags(aarch64-pageoff, aarch64-nc) @g ; CHECK-NEXT: renamable $w8 = LDRWroX killed renamable $x9, killed renamable $x8, 0, 0 :: (load (s32) from %ir.arrayidx14) ; CHECK-NEXT: dead $wzr = SUBSWri killed renamable $w8, 1, 0, implicit-def $nzcv @@ -252,11 +254,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5.if.then17: ; CHECK-NEXT: successors: %bb.7(0x80000000) - ; CHECK-NEXT: liveins: $w10, $x2, $x11 + ; CHECK-NEXT: liveins: $w10, $x2, $x12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: STRWui killed renamable $w10, %stack.3, 0 :: (store (s32) into %stack.3) - ; CHECK-NEXT: STRXui renamable $x11, %stack.4, 0 :: (store (s64) into %stack.4) - ; CHECK-NEXT: renamable $w20 = LDRWroX killed renamable $x2, killed renamable $x11, 0, 1 :: (volatile load (s32) from %ir.arrayidx19) + ; CHECK-NEXT: STRXui renamable $x12, %stack.4, 0 :: (store (s64) into %stack.4) + ; CHECK-NEXT: renamable $w20 = LDRWroX killed renamable $x2, killed renamable $x12, 0, 1 :: (volatile 
load (s32) from %ir.arrayidx19) ; CHECK-NEXT: renamable $w19 = MOVi32imm 100 ; CHECK-NEXT: B %bb.7 ; CHECK-NEXT: {{ $}} @@ -267,8 +269,9 @@ body: | ; CHECK-NEXT: renamable $w1 = COPY killed renamable $w20 ; CHECK-NEXT: INLINEASM &"nop;nop", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $x0, 12 /* clobber */, implicit-def dead early-clobber $x2, 12 /* clobber */, implicit-def dead early-clobber $x3, 12 /* clobber */, implicit-def dead early-clobber $x4, 12 /* clobber */, implicit-def dead early-clobber $x5, 12 /* clobber */, implicit-def dead early-clobber $x6, 12 /* clobber */, implicit-def dead early-clobber $x7, 12 /* clobber */, implicit-def dead early-clobber $x8, 12 /* clobber */, implicit-def dead early-clobber $x9, 12 /* clobber */, implicit-def dead early-clobber $x10, 12 /* clobber */, implicit-def dead early-clobber $x11, 12 /* clobber */, implicit-def dead early-clobber $x12, 12 /* clobber */, implicit-def dead early-clobber $x13, 12 /* clobber */, implicit-def dead early-clobber $x14, 12 /* clobber */, implicit-def dead early-clobber $x15, 12 /* clobber */, implicit-def dead early-clobber $x16, 12 /* clobber */, implicit-def dead early-clobber $x17, 12 /* clobber */, implicit-def dead early-clobber $x18, 12 /* clobber */, implicit-def dead early-clobber $x19, 12 /* clobber */, implicit-def dead early-clobber $x20, 12 /* clobber */, implicit-def dead early-clobber $x21, 12 /* clobber */, implicit-def dead early-clobber $x22, 12 /* clobber */, implicit-def dead early-clobber $x23, 12 /* clobber */, implicit-def dead early-clobber $x24, 12 /* clobber */, implicit-def dead early-clobber $x25, 12 /* clobber */, implicit-def dead early-clobber $x26, 12 /* clobber */, implicit-def dead early-clobber $x27, 12 /* clobber */, implicit-def dead early-clobber $x28, 12 /* clobber */, implicit-def dead early-clobber $fp, 12 /* clobber */, implicit-def dead early-clobber $lr ; CHECK-NEXT: renamable $x2 = LDRXui %stack.0, 0 :: (load 
(s64) from %stack.0) - ; CHECK-NEXT: renamable $x11 = LDRXui %stack.4, 0 :: (load (s64) from %stack.4) - ; CHECK-NEXT: STRWroX killed renamable $w1, renamable $x2, renamable $x11, 0, 1 :: (volatile store (s32) into %ir.sunkaddr1) + ; CHECK-NEXT: renamable $x12 = LDRXui %stack.4, 0 :: (load (s64) from %stack.4) + ; CHECK-NEXT: STRWroX killed renamable $w1, renamable $x2, renamable $x12, 0, 1 :: (volatile store (s32) into %ir.sunkaddr1) + ; CHECK-NEXT: renamable $w11 = LDRWui %stack.2, 0 :: (load (s32) from %stack.2) ; CHECK-NEXT: renamable $w10 = LDRWui %stack.3, 0 :: (load (s32) from %stack.3) ; CHECK-NEXT: B %bb.8 ; CHECK-NEXT: {{ $}} @@ -286,16 +289,16 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8.if.end: ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.4(0x7c000000) - ; CHECK-NEXT: liveins: $w10, $x2, $x11 + ; CHECK-NEXT: liveins: $w10, $w11, $x2, $x12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w10 = nsw SUBSWri killed renamable $w10, 1, 0, implicit-def $nzcv - ; CHECK-NEXT: renamable $x11 = nuw nsw ADDXri killed renamable $x11, 1, 0 + ; CHECK-NEXT: renamable $x12 = nuw nsw ADDXri killed renamable $x12, 1, 0 ; CHECK-NEXT: Bcc 1, %bb.4, implicit $nzcv ; CHECK-NEXT: B %bb.9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.9.do.end: ; CHECK-NEXT: successors: %bb.10(0x80000000) - ; CHECK-NEXT: liveins: $x2 + ; CHECK-NEXT: liveins: $w11, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w10 = COPY $wzr ; CHECK-NEXT: renamable $x8 = LDRXui %stack.1, 0 :: (load (s64) from %stack.1) @@ -303,11 +306,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.10.if.end27: ; CHECK-NEXT: successors: %bb.11(0x04000000), %bb.1(0x7c000000) - ; CHECK-NEXT: liveins: $w10, $x2, $x8 + ; CHECK-NEXT: liveins: $w10, $w11, $x2, $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w9 = LDRWui %stack.2, 0 :: (load (s32) from %stack.2) - ; CHECK-NEXT: renamable $w9 = nsw SUBSWri killed renamable $w9, 1, 0, implicit-def $nzcv - ; CHECK-NEXT: STRWui killed renamable $w9, %stack.2, 0 :: (store (s32) 
into %stack.2) + ; CHECK-NEXT: renamable $w11 = nsw SUBSWri killed renamable $w11, 1, 0, implicit-def $nzcv ; CHECK-NEXT: renamable $x8 = nuw nsw ADDXri killed renamable $x8, 1, 0 ; CHECK-NEXT: Bcc 1, %bb.1, implicit $nzcv ; CHECK-NEXT: B %bb.11 diff --git a/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding-tieddef.mir b/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding-tieddef.mir index 0ef750f94b86..7d074b3faefc 100644 --- a/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding-tieddef.mir +++ b/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding-tieddef.mir @@ -1,6 +1,6 @@ # RUN: llc %s -o - -experimental-debug-variable-locations \ # RUN: -start-before=x86-flags-copy-lowering -stop-after=virtregrewriter \ -# RUN: -mtriple x86_64-unknown-unknown \ +# RUN: -mtriple x86_64-unknown-unknown -enable-split-loopiv-heuristic=false \ # RUN: | FileCheck %s # # This test is for stack spill folding -- the INC32r near the end of the MIR |