author    ShatianWang <38512325+ShatianWang@users.noreply.github.com>  2023-12-21 16:17:10 -0500
committer GitHub <noreply@github.com>  2023-12-21 16:17:10 -0500
commit    157748341358f38ab55ea3a7a64276a5d4431d77 (patch)
tree      d94af9dc4fc7197a3881e3bc3798297ae1f88adf
parent    0cf3af0c5176cc067bf90b315466a8997498b988 (diff)
[BOLT] Don't split likely fallthrough in CDSplit (#76164)
This diff speeds up CDSplit by not considering any hot-warm splitting
point that could break a fall-through branch from a basic block to its
most likely successor.

Co-authored-by: spupyrev <spupyrev@fb.com>
-rw-r--r--  bolt/lib/Passes/SplitFunctions.cpp  100
-rw-r--r--  bolt/test/X86/cdsplit-call-scale.s    9
2 files changed, 68 insertions, 41 deletions
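
To make the heuristic concrete before reading the diff: the sketch below is a
minimal standalone illustration, not the BOLT code itself. Block and its edge
counts are hypothetical stand-ins for BinaryBasicBlock and its branch profile.
A most likely successor exists only for a block with a single successor, or
with two successors whose profiled edge counts differ:

#include <cstdint>
#include <vector>

// Hypothetical stand-in for BinaryBasicBlock: a block knows its successors
// and the profiled execution count of each outgoing edge.
struct Block {
  std::vector<const Block *> Succs;
  std::vector<uint64_t> EdgeCounts; // parallel to Succs
};

// Same decision rule as getMostLikelySuccessor() in the patch: a unique
// successor is always most likely; with two successors, pick the strictly
// hotter edge; on a tie, or with more than two successors, there is none.
const Block *mostLikelySuccessor(const Block &BB) {
  if (BB.Succs.size() == 1)
    return BB.Succs[0];
  if (BB.Succs.size() == 2 && BB.EdgeCounts[0] != BB.EdgeCounts[1])
    return BB.EdgeCounts[0] > BB.EdgeCounts[1] ? BB.Succs[0] : BB.Succs[1];
  return nullptr;
}

CDSplit then refuses to place the hot-warm boundary between a block and that
successor: splitting there would turn an almost-free fall-through into a
taken branch into another fragment.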
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index d9f5da3e3bca..5de075973004 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -175,8 +175,12 @@ struct SplitCacheDirected final : public SplitStrategy {
void fragment(const BlockIt Start, const BlockIt End) override {
BasicBlockOrder BlockOrder(Start, End);
BinaryFunction &BF = *BlockOrder.front()->getFunction();
+ // No need to re-split small functions.
+ if (BlockOrder.size() <= 2)
+ return;
size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
+ assert(BestSplitIndex < BlockOrder.size());
// Assign fragments based on the computed best split index.
// All basic blocks with index up to the best split index become hot.
@@ -200,10 +204,12 @@ private:
};
struct SplitScore {
- size_t SplitIndex;
+ size_t SplitIndex = size_t(-1);
size_t HotSizeReduction = 0;
double LocalScore = 0;
double CoverCallScore = 0;
+
+ double sum() const { return LocalScore + CoverCallScore; }
};
// Auxiliary variables used by the algorithm.
@@ -303,7 +309,7 @@ private:
const size_t SplitIndex) {
assert(SplitIndex < BlockOrder.size() && "Invalid split index");
- // Update function layout assuming hot-warm splitting at SplitIndex
+ // Update function layout assuming hot-warm splitting at SplitIndex.
for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
BinaryBasicBlock *BB = BlockOrder[Index];
if (BB->getFragmentNum() == FragmentNum::cold())
@@ -319,8 +325,8 @@ private:
// Populate BB.OutputAddressRange with estimated new start and end addresses
// and compute the old end address of the hot section and the new end
// address of the hot section.
- size_t OldHotEndAddr;
- size_t NewHotEndAddr;
+ size_t OldHotEndAddr{0};
+ size_t NewHotEndAddr{0};
size_t CurrentAddr = BBOffsets[BlockOrder[0]];
for (BinaryBasicBlock *BB : BlockOrder) {
// We only care about new addresses of blocks in hot/warm.
@@ -492,20 +498,15 @@ private:
}
/// Compute the split score of splitting a function at a given index.
- /// The split score consists of local score and cover score. Cover call score
- /// is expensive to compute. As a result, we pass in a \p ReferenceScore and
- /// compute cover score only when the local score exceeds that in the
- /// ReferenceScore or that the size reduction of the hot fragment is larger
- /// than that achieved by the split index of the ReferenceScore. This function
- /// returns \p Score of SplitScore type. It contains the local score and cover
- /// score (if computed) of the current splitting index. For easier book
- /// keeping and comparison, it also stores the split index and the resulting
- /// reduction in hot fragment size.
+ /// The split score consists of local score and cover score. This function
+ /// returns \p Score of SplitScore type. It contains the local score and
+ /// cover score of the current splitting index. For easier bookkeeping and
+ /// comparison, it also stores the split index and the resulting reduction
+ /// in hot fragment size.
SplitScore computeSplitScore(const BinaryFunction &BF,
const BasicBlockOrder &BlockOrder,
const size_t SplitIndex,
- const std::vector<CallInfo> &CoverCalls,
- const SplitScore &ReferenceScore) {
+ const std::vector<CallInfo> &CoverCalls) {
// Populate BinaryBasicBlock::OutputAddressRange with estimated
// new start and end addresses after hot-warm splitting at SplitIndex.
size_t OldHotEnd;
@@ -533,47 +534,74 @@ private:
// incremented in place.
computeJumpScore(BlockOrder, SplitIndex, Score);
- // There is no need to compute CoverCallScore if we have already found
- // another split index with a bigger LocalScore and bigger HotSizeReduction.
- if (Score.LocalScore <= ReferenceScore.LocalScore &&
- Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
- return Score;
-
// Compute CoverCallScore and store in Score in place.
computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
return Score;
}
+ /// Find the most likely successor of a basic block when it has one or two
+ /// successors; return nullptr otherwise, including on a two-way count tie.
+ const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
+ if (BB->succ_size() == 1)
+ return BB->getSuccessor();
+ if (BB->succ_size() == 2) {
+ uint64_t TakenCount = BB->getTakenBranchInfo().Count;
+ assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
+ uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
+ assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
+ if (TakenCount > NonTakenCount)
+ return BB->getConditionalSuccessor(true);
+ else if (TakenCount < NonTakenCount)
+ return BB->getConditionalSuccessor(false);
+ }
+ return nullptr;
+ }
+
/// Find the best index for splitting. The returned value is the index of the
/// last hot basic block. Hence, "no splitting" is equivalent to returning the
/// value which is one less than the size of the function.
size_t findSplitIndex(const BinaryFunction &BF,
const BasicBlockOrder &BlockOrder) {
+ assert(BlockOrder.size() > 2);
// Find all function calls that can be shortened if we move blocks of the
// current function to warm/cold
const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
- // Try all possible split indices (blocks with Index <= SplitIndex are in
- // hot) and find the one maximizing the splitting score.
+ // Find the existing hot-cold splitting index.
+ size_t HotColdIndex = 0;
+ while (HotColdIndex + 1 < BlockOrder.size()) {
+ if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
+ break;
+ HotColdIndex++;
+ }
+ assert(HotColdIndex + 1 == BlockOrder.size() ||
+ (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
+ BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
+ FragmentNum::cold()));
+
+ // Try all possible split indices up to HotColdIndex (blocks that have
+ // Index <= SplitIndex are in hot) and find the one maximizing the
+ // splitting score.
SplitScore BestScore;
- double BestScoreSum = -1.0;
- SplitScore ReferenceScore;
- for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
+ for (size_t Index = 0; Index <= HotColdIndex; Index++) {
const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
- // No need to keep cold blocks in the hot section.
- if (LastHotBB->getFragmentNum() == FragmentNum::cold())
- break;
+ assert(LastHotBB->getFragmentNum() != FragmentNum::cold());
+
+ // Do not break jump to the most likely successor.
+ if (Index + 1 < BlockOrder.size() &&
+ BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
+ continue;
+
const SplitScore Score =
- computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
- double ScoreSum = Score.LocalScore + Score.CoverCallScore;
- if (ScoreSum > BestScoreSum) {
- BestScoreSum = ScoreSum;
+ computeSplitScore(BF, BlockOrder, Index, CoverCalls);
+ if (Score.sum() > BestScore.sum())
BestScore = Score;
- }
- if (Score.LocalScore > ReferenceScore.LocalScore)
- ReferenceScore = Score;
}
+ // If we don't find a good splitting point, fall back to the original one.
+ if (BestScore.SplitIndex == size_t(-1))
+ return HotColdIndex;
+
return BestScore.SplitIndex;
}
};
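
Read as a whole, the new findSplitIndex() has the shape sketched below. This
is a simplified rendering, not the committed code: it reuses the hypothetical
Block and mostLikelySuccessor() from the earlier sketch and abstracts the
scoring (local score plus cover-call score) behind a callback.

#include <cstddef>
#include <functional>
#include <vector>

// Simplified shape of the new search: try every candidate boundary up to
// the existing hot-cold index, skip any candidate that would separate a
// block from its most likely successor, and fall back to the original
// hot-cold index if no candidate was ever scored.
size_t findBestSplit(const std::vector<Block> &Order, size_t HotColdIndex,
                     const std::function<double(size_t)> &Score) {
  size_t BestIndex = size_t(-1);
  double BestScore = -1.0;
  for (size_t Index = 0; Index <= HotColdIndex; ++Index) {
    // Do not break a likely fall-through at this boundary.
    if (Index + 1 < Order.size() &&
        &Order[Index + 1] == mostLikelySuccessor(Order[Index]))
      continue;
    const double S = Score(Index);
    if (S > BestScore) {
      BestScore = S;
      BestIndex = Index;
    }
  }
  return BestIndex == size_t(-1) ? HotColdIndex : BestIndex;
}

Because every fall-through-breaking index is pruned up front, each surviving
candidate's full score is computed unconditionally, which is why the
ReferenceScore short-circuit in computeSplitScore() could be dropped.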
diff --git a/bolt/test/X86/cdsplit-call-scale.s b/bolt/test/X86/cdsplit-call-scale.s
index 5b4f92832624..5701d9e6dfd6 100644
--- a/bolt/test/X86/cdsplit-call-scale.s
+++ b/bolt/test/X86/cdsplit-call-scale.s
@@ -2,8 +2,9 @@
# When -call-scale=0.0, the tested function is 2-way split.
# When -call-scale=1.0, the tested function is 3-way split with 5 blocks
# in warm because of the increased benefit of shortening the call edges.
-# When -call-scale=1000.0, the tested function is 3-way splitted with 7 blocks
-# in warm because of the strong benefit of shortening the call edges.
+# When -call-scale=1000.0, the tested function is still 3-way split with
+# 5 blocks in warm because cdsplit does not allow hot-warm splitting to break
+# a fall-through branch from a basic block to its most likely successor.
# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
@@ -39,12 +40,10 @@
# MEDINCENTIVE: {{^\.Ltmp5}}
# HIGHINCENTIVE: Binary Function "chain" after split-functions
-# HIGHINCENTIVE: {{^\.LBB00}}
+# HIGHINCENTIVE: {{^\.Ltmp1}}
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
# HIGHINCENTIVE: {{^\.LFT1}}
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
-# HIGHINCENTIVE: {{^\.LFT0}}
-# HIGHINCENTIVE: {{^\.Ltmp1}}
# HIGHINCENTIVE: {{^\.Ltmp0}}
# HIGHINCENTIVE: {{^\.Ltmp2}}
# HIGHINCENTIVE: {{^\.Ltmp3}}