author    | ShatianWang <38512325+ShatianWang@users.noreply.github.com> | 2023-12-21 16:17:10 -0500
committer | GitHub <noreply@github.com>                                  | 2023-12-21 16:17:10 -0500
commit    | 157748341358f38ab55ea3a7a64276a5d4431d77 (patch)
tree      | d94af9dc4fc7197a3881e3bc3798297ae1f88adf
parent    | 0cf3af0c5176cc067bf90b315466a8997498b988 (diff)
[BOLT] Don't split likely fallthrough in CDSplit (#76164)
This diff speeds up CDSplit by not considering any hot-warm splitting
point that could break a fall-through branch from a basic block to its
most likely successor.
Co-authored-by: spupyrev <spupyrev@fb.com>
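In plain terms, the new filter in findSplitIndex rejects any split index whose next block in the layout is the most likely successor of the last hot block, so the hottest fall-through edges always stay inside one fragment. Below is a minimal standalone sketch of that filtering idea, using simplified block and profile-count types rather than BOLT's BinaryBasicBlock API; the names Block, mostLikelySuccessor, and candidateSplitIndices are illustrative only.

```cpp
// Minimal sketch of the fall-through-preserving split filter (not BOLT code).
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

struct Block {
  // Successor block indices with their profile counts; at most two entries.
  std::vector<std::pair<size_t, uint64_t>> Succs;
};

// Index of the most likely successor when a block has one or two successors
// and the two counts are not tied; otherwise SIZE_MAX ("no clear winner").
size_t mostLikelySuccessor(const Block &B) {
  if (B.Succs.size() == 1)
    return B.Succs[0].first;
  if (B.Succs.size() == 2 && B.Succs[0].second != B.Succs[1].second)
    return B.Succs[0].second > B.Succs[1].second ? B.Succs[0].first
                                                 : B.Succs[1].first;
  return static_cast<size_t>(-1);
}

// Enumerate split indices (index of the last hot block) that do not separate
// a block from its most likely successor in the layout; indices that would
// break such a fall-through are skipped, mirroring the check this commit adds.
std::vector<size_t> candidateSplitIndices(const std::vector<Block> &Layout) {
  std::vector<size_t> Candidates;
  for (size_t Index = 0; Index + 1 < Layout.size(); ++Index)
    if (mostLikelySuccessor(Layout[Index]) != Index + 1)
      Candidates.push_back(Index);
  return Candidates;
}

int main() {
  std::vector<Block> Layout(3);
  Layout[0].Succs = {{1, 100}, {2, 1}}; // A: falls through to B almost always.
  Layout[1].Succs = {{0, 50}};          // B: only a back edge to A.
  for (size_t Index : candidateSplitIndices(Layout))
    std::cout << "candidate split index: " << Index << "\n"; // prints 1
}
```

Splitting after block 0 is skipped because block 1 is its dominant fall-through target, while splitting after block 1 remains eligible; BOLT's actual check additionally consults taken/fall-through branch profiles, as shown in the diff below.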
-rw-r--r-- | bolt/lib/Passes/SplitFunctions.cpp | 100
-rw-r--r-- | bolt/test/X86/cdsplit-call-scale.s |   9
2 files changed, 68 insertions, 41 deletions
```diff
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index d9f5da3e3bca..5de075973004 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -175,8 +175,12 @@ struct SplitCacheDirected final : public SplitStrategy {
   void fragment(const BlockIt Start, const BlockIt End) override {
     BasicBlockOrder BlockOrder(Start, End);
     BinaryFunction &BF = *BlockOrder.front()->getFunction();
+    // No need to re-split small functions.
+    if (BlockOrder.size() <= 2)
+      return;
 
     size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
+    assert(BestSplitIndex < BlockOrder.size());
 
     // Assign fragments based on the computed best split index.
     // All basic blocks with index up to the best split index become hot.
@@ -200,10 +204,12 @@ private:
   };
 
   struct SplitScore {
-    size_t SplitIndex;
+    size_t SplitIndex = size_t(-1);
     size_t HotSizeReduction = 0;
     double LocalScore = 0;
     double CoverCallScore = 0;
+
+    double sum() const { return LocalScore + CoverCallScore; }
   };
 
   // Auxiliary variables used by the algorithm.
@@ -303,7 +309,7 @@ private:
                                   const size_t SplitIndex) {
     assert(SplitIndex < BlockOrder.size() && "Invalid split index");
 
-    // Update function layout assuming hot-warm splitting at SplitIndex
+    // Update function layout assuming hot-warm splitting at SplitIndex.
     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
       BinaryBasicBlock *BB = BlockOrder[Index];
       if (BB->getFragmentNum() == FragmentNum::cold())
@@ -319,8 +325,8 @@ private:
     // Populate BB.OutputAddressRange with estimated new start and end addresses
     // and compute the old end address of the hot section and the new end
     // address of the hot section.
-    size_t OldHotEndAddr;
-    size_t NewHotEndAddr;
+    size_t OldHotEndAddr{0};
+    size_t NewHotEndAddr{0};
     size_t CurrentAddr = BBOffsets[BlockOrder[0]];
     for (BinaryBasicBlock *BB : BlockOrder) {
       // We only care about new addresses of blocks in hot/warm.
@@ -492,20 +498,15 @@ private:
   }
 
   /// Compute the split score of splitting a function at a given index.
-  /// The split score consists of local score and cover score. Cover call score
-  /// is expensive to compute. As a result, we pass in a \p ReferenceScore and
-  /// compute cover score only when the local score exceeds that in the
-  /// ReferenceScore or that the size reduction of the hot fragment is larger
-  /// than that achieved by the split index of the ReferenceScore. This function
-  /// returns \p Score of SplitScore type. It contains the local score and cover
-  /// score (if computed) of the current splitting index. For easier book
-  /// keeping and comparison, it also stores the split index and the resulting
-  /// reduction in hot fragment size.
+  /// The split score consists of local score and cover score. This function
+  /// returns \p Score of SplitScore type. It contains the local score and
+  /// cover score of the current splitting index. For easier book keeping and
+  /// comparison, it also stores the split index and the resulting reduction
+  /// in hot fragment size.
   SplitScore computeSplitScore(const BinaryFunction &BF,
                                const BasicBlockOrder &BlockOrder,
                                const size_t SplitIndex,
-                               const std::vector<CallInfo> &CoverCalls,
-                               const SplitScore &ReferenceScore) {
+                               const std::vector<CallInfo> &CoverCalls) {
     // Populate BinaryBasicBlock::OutputAddressRange with estimated
     // new start and end addresses after hot-warm splitting at SplitIndex.
     size_t OldHotEnd;
@@ -533,47 +534,74 @@ private:
     // increamented in place.
     computeJumpScore(BlockOrder, SplitIndex, Score);
 
-    // There is no need to compute CoverCallScore if we have already found
-    // another split index with a bigger LocalScore and bigger HotSizeReduction.
-    if (Score.LocalScore <= ReferenceScore.LocalScore &&
-        Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
-      return Score;
-
     // Compute CoverCallScore and store in Score in place.
     computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
     return Score;
   }
 
+  /// Find the most likely successor of a basic block when it has one or two
+  /// successors. Return nullptr otherwise.
+  const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
+    if (BB->succ_size() == 1)
+      return BB->getSuccessor();
+    if (BB->succ_size() == 2) {
+      uint64_t TakenCount = BB->getTakenBranchInfo().Count;
+      assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
+      uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
+      assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
+      if (TakenCount > NonTakenCount)
+        return BB->getConditionalSuccessor(true);
+      else if (TakenCount < NonTakenCount)
+        return BB->getConditionalSuccessor(false);
+    }
+    return nullptr;
+  }
+
   /// Find the best index for splitting. The returned value is the index of the
   /// last hot basic block. Hence, "no splitting" is equivalent to returning the
   /// value which is one less than the size of the function.
   size_t findSplitIndex(const BinaryFunction &BF,
                         const BasicBlockOrder &BlockOrder) {
+    assert(BlockOrder.size() > 2);
     // Find all function calls that can be shortened if we move blocks of the
     // current function to warm/cold
     const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
 
-    // Try all possible split indices (blocks with Index <= SplitIndex are in
-    // hot) and find the one maximizing the splitting score.
+    // Find the existing hot-cold splitting index.
+    size_t HotColdIndex = 0;
+    while (HotColdIndex + 1 < BlockOrder.size()) {
+      if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
+        break;
+      HotColdIndex++;
+    }
+    assert(HotColdIndex + 1 == BlockOrder.size() ||
+           (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
+            BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
+                FragmentNum::cold()));
+
+    // Try all possible split indices up to HotColdIndex (blocks that have
+    // Index <= SplitIndex are in hot) and find the one maximizing the
+    // splitting score.
     SplitScore BestScore;
-    double BestScoreSum = -1.0;
-    SplitScore ReferenceScore;
-    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
+    for (size_t Index = 0; Index <= HotColdIndex; Index++) {
       const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
-      // No need to keep cold blocks in the hot section.
-      if (LastHotBB->getFragmentNum() == FragmentNum::cold())
-        break;
+      assert(LastHotBB->getFragmentNum() != FragmentNum::cold());
+
+      // Do not break jump to the most likely successor.
+      if (Index + 1 < BlockOrder.size() &&
+          BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
+        continue;
+
       const SplitScore Score =
-          computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
-      double ScoreSum = Score.LocalScore + Score.CoverCallScore;
-      if (ScoreSum > BestScoreSum) {
-        BestScoreSum = ScoreSum;
+          computeSplitScore(BF, BlockOrder, Index, CoverCalls);
+      if (Score.sum() > BestScore.sum())
         BestScore = Score;
-      }
-      if (Score.LocalScore > ReferenceScore.LocalScore)
-        ReferenceScore = Score;
     }
 
+    // If we don't find a good splitting point, fallback to the original one.
+    if (BestScore.SplitIndex == size_t(-1))
+      return HotColdIndex;
+
     return BestScore.SplitIndex;
   }
 };
diff --git a/bolt/test/X86/cdsplit-call-scale.s b/bolt/test/X86/cdsplit-call-scale.s
index 5b4f92832624..5701d9e6dfd6 100644
--- a/bolt/test/X86/cdsplit-call-scale.s
+++ b/bolt/test/X86/cdsplit-call-scale.s
@@ -2,8 +2,9 @@
 # When -call-scale=0.0, the tested function is 2-way splitted.
 # When -call-scale=1.0, the tested function is 3-way splitted with 5 blocks
 # in warm because of the increased benefit of shortening the call edges.
-# When -call-scale=1000.0, the tested function is 3-way splitted with 7 blocks
-# in warm because of the strong benefit of shortening the call edges.
+# When -call-scale=1000.0, the tested function is still 3-way splitted with
+# 5 blocks in warm because cdsplit does not allow hot-warm splitting to break
+# a fall through branch from a basic block to its most likely successor.
 
 # RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
 # RUN: link_fdata %s %t.o %t.fdata
@@ -39,12 +40,10 @@
 # MEDINCENTIVE: {{^\.Ltmp5}}
 
 # HIGHINCENTIVE: Binary Function "chain" after split-functions
-# HIGHINCENTIVE: {{^\.LBB00}}
+# HIGHINCENTIVE: {{^\.Ltmp1}}
 # HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
 # HIGHINCENTIVE: {{^\.LFT1}}
 # HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
-# HIGHINCENTIVE: {{^\.LFT0}}
-# HIGHINCENTIVE: {{^\.Ltmp1}}
 # HIGHINCENTIVE: {{^\.Ltmp0}}
 # HIGHINCENTIVE: {{^\.Ltmp2}}
 # HIGHINCENTIVE: {{^\.Ltmp3}}
```