diff options
author | Alexey Bataev <a.bataev@outlook.com> | 2024-03-29 17:23:51 +0000 |
---|---|---|
committer | Alexey Bataev <a.bataev@outlook.com> | 2024-03-29 17:23:51 +0000 |
commit | 84cdd61dfee3c46855c0193c3dedc3cd76e66cf2 (patch) | |
tree | dc69c0f8dc85d725b0efdb8180a9875982d1f9bc | |
parent | 84299df301dc07ea83fa8378051103195c3a7c65 (diff) |
[𝘀𝗽𝗿] initial versionupstream/users/alexey-bataev/spr/slpimprove-reordering-for-consts-splats-and-ops-from-same-nodes-improved-analysis
Created using spr 1.3.5
11 files changed, 212 insertions, 178 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2875e71081d9..46243c60324a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1334,12 +1334,19 @@ public: return LookAheadHeuristics::ScoreSplat; } + auto CheckSameEntryOrFail = [&]() { + if (const TreeEntry *TE1 = R.getTreeEntry(V1); + TE1 && TE1 == R.getTreeEntry(V2)) + return LookAheadHeuristics::ScoreSplatLoads; + return LookAheadHeuristics::ScoreFail; + }; + auto *LI1 = dyn_cast<LoadInst>(V1); auto *LI2 = dyn_cast<LoadInst>(V2); if (LI1 && LI2) { if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() || !LI2->isSimple()) - return LookAheadHeuristics::ScoreFail; + return CheckSameEntryOrFail(); std::optional<int> Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), @@ -1351,7 +1358,7 @@ public: FixedVectorType::get(LI1->getType(), NumLanes), LI1->getAlign())) return LookAheadHeuristics::ScoreMaskedGatherCandidate; - return LookAheadHeuristics::ScoreFail; + return CheckSameEntryOrFail(); } // The distance is too large - still may be profitable to use masked // loads/gathers. @@ -1408,14 +1415,14 @@ public: } return LookAheadHeuristics::ScoreAltOpcodes; } - return LookAheadHeuristics::ScoreFail; + return CheckSameEntryOrFail(); } auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (I1 && I2) { if (I1->getParent() != I2->getParent()) - return LookAheadHeuristics::ScoreFail; + return CheckSameEntryOrFail(); SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end()); Ops.push_back(I1); Ops.push_back(I2); @@ -1436,7 +1443,7 @@ public: if (isa<UndefValue>(V2)) return LookAheadHeuristics::ScoreUndef; - return LookAheadHeuristics::ScoreFail; + return CheckSameEntryOrFail(); } /// Go through the operands of \p LHS and \p RHS recursively until @@ -1599,6 +1606,7 @@ public: const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; + const Loop *L = nullptr; /// \returns the operand data at \p OpIdx and \p Lane. OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -1767,8 +1775,9 @@ public: // Track if the operand must be marked as used. If the operand is set to // Score 1 explicitly (because of non power-of-2 unique scalars, we may // want to reestimate the operands again on the following iterations). - bool IsUsed = - RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant; + bool IsUsed = RMode == ReorderingMode::Splat || + RMode == ReorderingMode::Constant || + RMode == ReorderingMode::Load; // Iterate through all unused operands and look for the best. for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { // Get the operand at Idx and Lane. @@ -1789,23 +1798,44 @@ public: // Look for an operand that matches the current mode. switch (RMode) { case ReorderingMode::Load: - case ReorderingMode::Constant: case ReorderingMode::Opcode: { bool LeftToRight = Lane > LastLane; Value *OpLeft = (LeftToRight) ? OpLastLane : Op; Value *OpRight = (LeftToRight) ? Op : OpLastLane; int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane, OpIdx, Idx, IsUsed); - if (Score > static_cast<int>(BestOp.Score)) { + if (Score > static_cast<int>(BestOp.Score) || + (Score > 0 && Score == static_cast<int>(BestOp.Score) && + Idx == OpIdx)) { BestOp.Idx = Idx; BestOp.Score = Score; BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score; } break; } + case ReorderingMode::Constant: + if (isa<Constant>(Op) || + (!BestOp.Score && L && L->isLoopInvariant(Op))) { + BestOp.Idx = Idx; + if (isa<Constant>(Op)) { + BestOp.Score = LookAheadHeuristics::ScoreConstants; + BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = + LookAheadHeuristics::ScoreConstants; + } + if (isa<UndefValue>(Op) || !isa<Constant>(Op)) + IsUsed = false; + } + break; case ReorderingMode::Splat: - if (Op == OpLastLane) + if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) { + IsUsed = Op == OpLastLane; + if (Op == OpLastLane) { + BestOp.Score = LookAheadHeuristics::ScoreSplat; + BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = + LookAheadHeuristics::ScoreSplat; + } BestOp.Idx = Idx; + } break; case ReorderingMode::Failed: llvm_unreachable("Not expected Failed reordering mode."); @@ -1999,6 +2029,8 @@ public: /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { bool OpAPO = getData(OpIdx, Lane).APO; + bool IsInvariant = L && L->isLoopInvariant(Op); + unsigned Cnt = 0; for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { if (Ln == Lane) continue; @@ -2008,22 +2040,37 @@ public: OperandData &Data = getData(OpI, Ln); if (Data.APO != OpAPO || Data.IsUsed) continue; - if (Data.V == Op) { + Value *OpILane = getValue(OpI, Lane); + bool IsConstantOp = isa<Constant>(OpILane); + if (Data.V == Op || + (!IsConstantOp && + ((Lns > 2 && isa<Constant>(Data.V)) || + (Lns == 2 && + !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) + .getOpcode() && + isa<Constant>(Data.V)))) || + (IsInvariant && !isa<Constant>(Data.V) && + !getSameOpcode({Op, Data.V}, TLI).getOpcode() && + L->isLoopInvariant(Data.V))) { FoundCandidate = true; - Data.IsUsed = true; + Data.IsUsed = Data.V == Op; + if (Data.V == Op) + ++Cnt; break; } } if (!FoundCandidate) return false; } - return true; + return getNumLanes() == 2 || Cnt > 1; } public: /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R) - : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) { + : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R), + L(R.LI->getLoopFor( + (cast<Instruction>(RootVL.front())->getParent()))) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -2155,8 +2202,6 @@ public: // getBestOperand(). swap(OpIdx, *BestIdx, Lane); } else { - // We failed to find a best operand, set mode to 'Failed'. - ReorderingModes[OpIdx] = ReorderingMode::Failed; // Enable the second pass. StrategyFailed = true; } diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index d87bdfe26899..aa9a070a7945 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -37,10 +37,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 ; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1 ; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP101]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP16]], [[TMP18]] ; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1 ; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1 @@ -64,15 +64,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]] ; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16> ; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], [[TMP31]] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 -; CHECK-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP40]], [[TMP39]] -; CHECK-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP39]], [[TMP40]] -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1 -; CHECK-NEXT: [[CONV:%.*]] = add i32 [[TMP42]], [[TMP41]] -; CHECK-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP41]], [[TMP42]] -; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]] +; CHECK-NEXT: [[ADD44_2:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; CHECK-NEXT: [[CONV:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; CHECK-NEXT: [[ADD44_3:%.*]] = add i32 [[CONV]], [[ADD44_2]] +; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[CONV]] +; CHECK-NEXT: [[SUB45_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 +; CHECK-NEXT: [[SUB47_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1 +; CHECK-NEXT: [[ADD46_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]] +; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]] +; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_3]] ; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1 ; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 ; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 @@ -104,10 +104,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]] ; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16> ; CHECK-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]] -; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]] -; CHECK-NEXT: [[TMP73:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]] +; CHECK-NEXT: [[TMP190:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]] +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1 ; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]] ; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]] ; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]] @@ -115,19 +115,19 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP79]], 15 ; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 ; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15 +; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[ADD46_2]], 15 ; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 ; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0 -; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP107]], 15 -; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 -; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 +; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 +; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[TMP107]], 15 +; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537 +; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535 ; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15 ; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537 ; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535 -; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[CONV1]], 15 -; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537 -; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535 +; CHECK-NEXT: [[SHR_I49_6:%.*]] = lshr i32 [[CONV1]], 15 +; CHECK-NEXT: [[AND_I50_6:%.*]] = and i32 [[SHR_I49_6]], 65537 +; CHECK-NEXT: [[MUL_I51_6:%.*]] = mul i32 [[AND_I50_6]], 65535 ; CHECK-NEXT: [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 ; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32> ; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1 @@ -151,21 +151,21 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP97]], [[TMP90]] ; CHECK-NEXT: [[TMP104:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]] ; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; CHECK-NEXT: [[TMP103:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]] -; CHECK-NEXT: [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP103]] +; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]] +; CHECK-NEXT: [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP101]] ; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[TMP165:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]] +; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]] ; CHECK-NEXT: [[TMP105:%.*]] = sub <2 x i32> [[TMP200]], [[TMP104]] -; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0 -; CHECK-NEXT: [[TMP143:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1 -; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP143]], [[TMP238]] -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1 -; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP143]], 15 -; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 -; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 +; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0 +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1 +; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP108]], [[TMP238]] +; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1 ; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15 ; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 ; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 +; CHECK-NEXT: [[SHR_I59_4:%.*]] = lshr i32 [[TMP142]], 15 +; CHECK-NEXT: [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537 +; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535 ; CHECK-NEXT: [[TMP109:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 ; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> ; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i8> poison, i8 [[TMP12]], i32 0 @@ -185,7 +185,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16> ; CHECK-NEXT: [[TMP126:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 1, i64 3> ; CHECK-NEXT: [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP126]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison) -; CHECK-NEXT: [[TMP153:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32> +; CHECK-NEXT: [[TMP144:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32> ; CHECK-NEXT: [[TMP129:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 5, i64 7> ; CHECK-NEXT: [[TMP130:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP129]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison) ; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32> @@ -195,15 +195,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP131]], [[TMP134]] ; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16> ; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP153]] +; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP144]] ; CHECK-NEXT: [[TMP139:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]] ; CHECK-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0 ; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP113]] -; CHECK-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]] -; CHECK-NEXT: [[TMP257:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]] -; CHECK-NEXT: [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP139]] -; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP257]], i32 0 -; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP257]], i32 1 +; CHECK-NEXT: [[TMP155:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]] +; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP139]], [[TMP155]] +; CHECK-NEXT: [[TMP189:%.*]] = sub <2 x i32> [[TMP155]], [[TMP139]] +; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 +; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1 ; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]] ; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP146]], 15 ; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 @@ -220,37 +220,37 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_2]], [[ADD103]] ; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP79]] ; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51_3]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]] +; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]] ; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] ; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]] -; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] -; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP143]] +; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]] +; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP108]] ; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] ; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] ; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] ; CHECK-NEXT: [[TMP150:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> poison, <2 x i32> <i32 1, i32 0> -; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB47_2]], i32 1 -; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_2]], i32 1 -; CHECK-NEXT: [[TMP163:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]] -; CHECK-NEXT: [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 1, i32 2> -; CHECK-NEXT: [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 0, i32 3> -; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP155]] -; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1 +; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB59_2]], i32 1 +; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB51_2]], i32 1 +; CHECK-NEXT: [[TMP153:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]] +; CHECK-NEXT: [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP189]], <2 x i32> [[TMP190]], <2 x i32> <i32 1, i32 2> +; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP189]], <2 x i32> [[TMP190]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP184]] +; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP153]], i32 1 ; CHECK-NEXT: [[TMP158:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1 -; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 1, i32 3> ; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP158]], [[TMP157]] -; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0 +; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP153]], i32 0 ; CHECK-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0 -; CHECK-NEXT: [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 0, i32 2> ; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP161]], [[TMP160]] -; CHECK-NEXT: [[TMP164:%.*]] = sub <2 x i32> [[TMP163]], [[TMP156]] -; CHECK-NEXT: [[TMP173:%.*]] = extractelement <2 x i32> [[TMP164]], i32 0 -; CHECK-NEXT: [[TMP174:%.*]] = extractelement <2 x i32> [[TMP164]], i32 1 -; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[TMP174]], [[TMP173]] -; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[TMP173]], [[TMP174]] -; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]] +; CHECK-NEXT: [[TMP163:%.*]] = sub <2 x i32> [[TMP153]], [[TMP156]] +; CHECK-NEXT: [[TMP164:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0 +; CHECK-NEXT: [[TMP165:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1 +; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[TMP165]], [[TMP164]] +; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[TMP164]], [[TMP165]] +; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_1]] ; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP107]] -; CHECK-NEXT: [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP189]], <2 x i32> <i32 1, i32 3> ; CHECK-NEXT: [[TMP167:%.*]] = lshr <2 x i32> [[TMP166]], <i32 15, i32 15> ; CHECK-NEXT: [[TMP168:%.*]] = and <2 x i32> [[TMP167]], <i32 65537, i32 65537> ; CHECK-NEXT: [[TMP169:%.*]] = mul <2 x i32> [[TMP168]], <i32 65535, i32 65535> @@ -263,44 +263,44 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP283:%.*]] = shufflevector <2 x i32> [[TMP282]], <2 x i32> [[TMP211]], <2 x i32> <i32 0, i32 3> ; CHECK-NEXT: [[TMP177:%.*]] = add <2 x i32> [[TMP169]], [[TMP283]] ; CHECK-NEXT: [[TMP178:%.*]] = xor <2 x i32> [[TMP177]], [[TMP166]] -; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] -; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP108]] +; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]] +; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP142]] ; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] ; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP178]], i32 0 ; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP179]] ; CHECK-NEXT: [[TMP180:%.*]] = extractelement <2 x i32> [[TMP178]], i32 1 ; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP180]] ; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] -; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP165]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0> -; CHECK-NEXT: [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_2]], i32 0 -; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> [[TMP165]], i32 [[CONV]], i32 0 -; CHECK-NEXT: [[TMP184:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]] -; CHECK-NEXT: [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 1, i32 2> -; CHECK-NEXT: [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0> +; CHECK-NEXT: [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_3]], i32 0 +; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> [[TMP106]], i32 [[ADD46_2]], i32 0 +; CHECK-NEXT: [[TMP195:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]] +; CHECK-NEXT: [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 1, i32 2> +; CHECK-NEXT: [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 0, i32 3> ; CHECK-NEXT: [[TMP187:%.*]] = sub <2 x i32> [[TMP185]], [[TMP186]] -; CHECK-NEXT: [[TMP188:%.*]] = extractelement <2 x i32> [[TMP184]], i32 0 -; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0 -; CHECK-NEXT: [[TMP190:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP189]], [[TMP188]] -; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP184]], i32 1 +; CHECK-NEXT: [[TMP188:%.*]] = extractelement <2 x i32> [[TMP195]], i32 0 +; CHECK-NEXT: [[TMP196:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0 +; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP195]], <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP196]], [[TMP188]] +; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP195]], i32 1 ; CHECK-NEXT: [[TMP192:%.*]] = extractelement <2 x i32> [[TMP187]], i32 1 -; CHECK-NEXT: [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP195]], <2 x i32> <i32 1, i32 3> ; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP192]], [[TMP191]] -; CHECK-NEXT: [[TMP194:%.*]] = sub <2 x i32> [[TMP184]], [[TMP187]] +; CHECK-NEXT: [[TMP194:%.*]] = sub <2 x i32> [[TMP195]], [[TMP187]] ; CHECK-NEXT: [[TMP244:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0 ; CHECK-NEXT: [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP244]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0 ; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP246:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]] -; CHECK-NEXT: [[TMP247:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]] -; CHECK-NEXT: [[TMP248:%.*]] = shufflevector <2 x i32> [[TMP246]], <2 x i32> [[TMP247]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP216:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]] +; CHECK-NEXT: [[TMP210:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]] +; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP216]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 3> ; CHECK-NEXT: [[TMP215:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0 ; CHECK-NEXT: [[TMP203:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1 ; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP215]], [[TMP203]] ; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP203]], [[TMP215]] ; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_2]] ; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] -; CHECK-NEXT: [[TMP266:%.*]] = add <2 x i32> [[TMP149]], [[TMP248]] +; CHECK-NEXT: [[TMP266:%.*]] = add <2 x i32> [[TMP149]], [[TMP221]] ; CHECK-NEXT: [[TMP267:%.*]] = xor <2 x i32> [[TMP266]], [[TMP110]] ; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP238]], 15 ; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 @@ -313,48 +313,48 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP207:%.*]] = extractelement <2 x i32> [[TMP267]], i32 1 ; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP207]] ; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] -; CHECK-NEXT: [[TMP221:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB45_2]], i32 0 -; CHECK-NEXT: [[TMP222:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB47_2]], i32 0 -; CHECK-NEXT: [[TMP210:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]] -; CHECK-NEXT: [[TMP225:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 2> -; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 0, i32 3> -; CHECK-NEXT: [[TMP226:%.*]] = sub <2 x i32> [[TMP225]], [[TMP212]] -; CHECK-NEXT: [[TMP214:%.*]] = extractelement <2 x i32> [[TMP210]], i32 0 -; CHECK-NEXT: [[TMP227:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0 -; CHECK-NEXT: [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP227]], [[TMP214]] -; CHECK-NEXT: [[TMP217:%.*]] = extractelement <2 x i32> [[TMP210]], i32 1 -; CHECK-NEXT: [[TMP218:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1 -; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 1, i32 3> -; CHECK-NEXT: [[SUB59:%.*]] = add i32 [[TMP218]], [[TMP217]] -; CHECK-NEXT: [[TMP220:%.*]] = sub <2 x i32> [[TMP210]], [[TMP226]] -; CHECK-NEXT: [[TMP274:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0 -; CHECK-NEXT: [[TMP275:%.*]] = shufflevector <2 x i32> [[TMP274]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP222:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB51_2]], i32 0 +; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB59_2]], i32 0 +; CHECK-NEXT: [[TMP226:%.*]] = sub <2 x i32> [[TMP222]], [[TMP225]] +; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP190]], <2 x i32> [[TMP189]], <2 x i32> <i32 1, i32 2> +; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP190]], <2 x i32> [[TMP189]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP213:%.*]] = sub <2 x i32> [[TMP227]], [[TMP212]] +; CHECK-NEXT: [[TMP214:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0 +; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0 +; CHECK-NEXT: [[TMP239:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[TMP237]], [[TMP214]] +; CHECK-NEXT: [[TMP217:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1 +; CHECK-NEXT: [[TMP218:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1 +; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP226]], <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP218]], [[TMP217]] +; CHECK-NEXT: [[TMP240:%.*]] = sub <2 x i32> [[TMP226]], [[TMP213]] ; CHECK-NEXT: [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 ; CHECK-NEXT: [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP276:%.*]] = add <2 x i32> [[TMP275]], [[TMP224]] -; CHECK-NEXT: [[TMP277:%.*]] = sub <2 x i32> [[TMP275]], [[TMP224]] -; CHECK-NEXT: [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP276]], <2 x i32> [[TMP277]], <2 x i32> <i32 0, i32 3> -; CHECK-NEXT: [[TMP228:%.*]] = extractelement <2 x i32> [[TMP220]], i32 0 -; CHECK-NEXT: [[TMP229:%.*]] = extractelement <2 x i32> [[TMP220]], i32 1 +; CHECK-NEXT: [[TMP241:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_5]], i32 0 +; CHECK-NEXT: [[TMP242:%.*]] = shufflevector <2 x i32> [[TMP241]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP261:%.*]] = add <2 x i32> [[TMP224]], [[TMP242]] +; CHECK-NEXT: [[TMP262:%.*]] = sub <2 x i32> [[TMP224]], [[TMP242]] +; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP228:%.*]] = extractelement <2 x i32> [[TMP240]], i32 0 +; CHECK-NEXT: [[TMP229:%.*]] = extractelement <2 x i32> [[TMP240]], i32 1 ; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[TMP228]], [[TMP229]] ; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[TMP229]], [[TMP228]] -; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_3]] +; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_6]], [[ADD105_3]] ; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]] ; CHECK-NEXT: [[TMP230:%.*]] = lshr <2 x i32> [[TMP102]], <i32 15, i32 15> ; CHECK-NEXT: [[TMP231:%.*]] = and <2 x i32> [[TMP230]], <i32 65537, i32 65537> ; CHECK-NEXT: [[TMP232:%.*]] = mul <2 x i32> [[TMP231]], <i32 65535, i32 65535> -; CHECK-NEXT: [[TMP286:%.*]] = add <2 x i32> [[TMP232]], [[TMP278]] -; CHECK-NEXT: [[TMP287:%.*]] = xor <2 x i32> [[TMP286]], [[TMP102]] +; CHECK-NEXT: [[TMP233:%.*]] = add <2 x i32> [[TMP232]], [[TMP220]] +; CHECK-NEXT: [[TMP234:%.*]] = xor <2 x i32> [[TMP233]], [[TMP102]] ; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15 ; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537 ; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535 ; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] ; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] ; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] -; CHECK-NEXT: [[TMP235:%.*]] = extractelement <2 x i32> [[TMP287]], i32 0 +; CHECK-NEXT: [[TMP235:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0 ; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP235]] -; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP287]], i32 1 +; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1 ; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP236]] ; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] ; CHECK-NEXT: ret i32 [[ADD113_3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll index 94534274cab2..5f8941e9f889 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -332,18 +332,14 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re define void @vec_shuff_reorder() #0 { ; CHECK-LABEL: @vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @fb, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @fa, align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 1), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fa, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr @fb, align 4 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; CHECK-NEXT: [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll index 610cc5bdeb31..536526a5cfe0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll @@ -6,11 +6,11 @@ define i1 @test() { ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 0, 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i64> [[TMP1]], i64 0, i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> [[TMP2]], i64 0, i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> <i32 3, i32 poison, i32 1, i32 0> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> <i64 undef, i64 0, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <8 x i64> [[TMP3]], <8 x i32> <i32 11, i32 11, i32 11, i32 1, i32 9, i32 9, i32 1, i32 8> +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 0, i64 0, i64 0, i64 0>, i64 0, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> <i64 undef, i64 undef, i64 0, i64 0>, i64 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP11]], i64 0, i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 1, i32 3, i32 0> ; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = sub <8 x i64> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 5, i32 6, i32 7> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll index cac0491d0b64..7ae6793fba4c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll @@ -8,12 +8,10 @@ define i1 @test(float %0, double %1) { ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 poison> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> <double poison, double 0.000000e+00, double poison, double 0.000000e+00>, <4 x i32> <i32 poison, i32 5, i32 2, i32 7> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 1> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double poison, double 0.000000e+00, double poison, double poison>, <4 x i32> <i32 poison, i32 5, i32 2, i32 3> ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 0, i32 poison> -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 1, i32 2, i32 0, i32 7> ; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index dd7ba71ed673..f1580599ba12 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -12,8 +12,8 @@ define i64 @foo(i32 %tmp7) { ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 2, i32 3, i32 5, i32 poison, i32 6> ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 6 ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7> +; CHECK-NEXT: [[TMP77:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP77]], <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7> ; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll index f665dac3282b..24b95c4e6ff2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll @@ -7,17 +7,14 @@ define void @foo(double %i) { ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00>, double [[I]], i32 2 ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> <double 0.000000e+00, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x i32> <i32 8, i32 poison, i32 2, i32 poison, i32 poison, i32 13, i32 14, i32 15> -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> [[TMP7]], double [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 1> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 5, i32 poison, i32 7> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP5]], i32 6 -; CHECK-NEXT: [[TMP12:%.*]] = fmul <8 x double> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 1, i32 poison, i32 0, i32 poison, i32 1> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> <double poison, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 9, i32 poison, i32 3, i32 12, i32 5, i32 poison, i32 7> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 2, i32 7> +; CHECK-NEXT: [[TMP12:%.*]] = fmul <8 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> zeroinitializer, [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = fcmp ult <8 x double> [[TMP14]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll index 593aad82ad5d..8562e53b1538 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -185,9 +185,13 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1 ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0> +; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8 +; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4 +; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] @@ -233,13 +237,9 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8 -; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4 -; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0> ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll b/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll index 681d131c5072..488ca0b23cd9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll @@ -10,7 +10,7 @@ define void @foo() { ; CHECK-NEXT: br label [[BCI_252:%.*]] ; CHECK: bci_252: ; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[BCI_0:%.*]] ], [ [[TMP16:%.*]], [[BCI_252_1:%.*]] ] -; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i32> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i32> [[TMP6]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll index 19a3a7d53df0..9df7aa1c727c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll @@ -5,8 +5,8 @@ define void @test(i32 %0, ptr %p) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i32 [[TMP0:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 1, i32 0, i32 poison>, i32 [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 0, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 0> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[PH:%.*]] ; CHECK: ph: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll index 69ecf1852aed..8f1d7a11e150 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll @@ -7,11 +7,9 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1 -; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4 -; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2 -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4 ; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3 @@ -21,10 +19,11 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] +; CHECK-NEXT: [[T9:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[T15:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 ; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] @@ -34,20 +33,19 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> <i32 1, i32 0, i32 6, i32 poison> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3> +; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 poison, i32 poison, i32 3> +; CHECK-NEXT: [[T701:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T50]], i32 5 ; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 -; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4 ; CHECK-NEXT: ret void ; |