summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@outlook.com>2024-03-29 17:23:51 +0000
committerAlexey Bataev <a.bataev@outlook.com>2024-03-29 17:23:51 +0000
commit84cdd61dfee3c46855c0193c3dedc3cd76e66cf2 (patch)
treedc69c0f8dc85d725b0efdb8180a9875982d1f9bc
parent84299df301dc07ea83fa8378051103195c3a7c65 (diff)
Created using spr 1.3.5
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp77
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll212
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/addsub.ll12
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll10
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll8
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll4
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll13
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll16
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll2
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll4
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll32
11 files changed, 212 insertions, 178 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2875e71081d9..46243c60324a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1334,12 +1334,19 @@ public:
return LookAheadHeuristics::ScoreSplat;
}
+ auto CheckSameEntryOrFail = [&]() {
+ if (const TreeEntry *TE1 = R.getTreeEntry(V1);
+ TE1 && TE1 == R.getTreeEntry(V2))
+ return LookAheadHeuristics::ScoreSplatLoads;
+ return LookAheadHeuristics::ScoreFail;
+ };
+
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2) {
if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
!LI2->isSimple())
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
std::optional<int> Dist = getPointersDiff(
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
@@ -1351,7 +1358,7 @@ public:
FixedVectorType::get(LI1->getType(), NumLanes),
LI1->getAlign()))
return LookAheadHeuristics::ScoreMaskedGatherCandidate;
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
}
// The distance is too large - still may be profitable to use masked
// loads/gathers.
@@ -1408,14 +1415,14 @@ public:
}
return LookAheadHeuristics::ScoreAltOpcodes;
}
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
}
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
if (I1->getParent() != I2->getParent())
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
Ops.push_back(I1);
Ops.push_back(I2);
@@ -1436,7 +1443,7 @@ public:
if (isa<UndefValue>(V2))
return LookAheadHeuristics::ScoreUndef;
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
}
/// Go through the operands of \p LHS and \p RHS recursively until
@@ -1599,6 +1606,7 @@ public:
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
+ const Loop *L = nullptr;
/// \returns the operand data at \p OpIdx and \p Lane.
OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -1767,8 +1775,9 @@ public:
// Track if the operand must be marked as used. If the operand is set to
// Score 1 explicitly (because of non power-of-2 unique scalars, we may
// want to reestimate the operands again on the following iterations).
- bool IsUsed =
- RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
+ bool IsUsed = RMode == ReorderingMode::Splat ||
+ RMode == ReorderingMode::Constant ||
+ RMode == ReorderingMode::Load;
// Iterate through all unused operands and look for the best.
for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
// Get the operand at Idx and Lane.
@@ -1789,23 +1798,44 @@ public:
// Look for an operand that matches the current mode.
switch (RMode) {
case ReorderingMode::Load:
- case ReorderingMode::Constant:
case ReorderingMode::Opcode: {
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
OpIdx, Idx, IsUsed);
- if (Score > static_cast<int>(BestOp.Score)) {
+ if (Score > static_cast<int>(BestOp.Score) ||
+ (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
+ Idx == OpIdx)) {
BestOp.Idx = Idx;
BestOp.Score = Score;
BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
}
break;
}
+ case ReorderingMode::Constant:
+ if (isa<Constant>(Op) ||
+ (!BestOp.Score && L && L->isLoopInvariant(Op))) {
+ BestOp.Idx = Idx;
+ if (isa<Constant>(Op)) {
+ BestOp.Score = LookAheadHeuristics::ScoreConstants;
+ BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
+ LookAheadHeuristics::ScoreConstants;
+ }
+ if (isa<UndefValue>(Op) || !isa<Constant>(Op))
+ IsUsed = false;
+ }
+ break;
case ReorderingMode::Splat:
- if (Op == OpLastLane)
+ if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
+ IsUsed = Op == OpLastLane;
+ if (Op == OpLastLane) {
+ BestOp.Score = LookAheadHeuristics::ScoreSplat;
+ BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
+ LookAheadHeuristics::ScoreSplat;
+ }
BestOp.Idx = Idx;
+ }
break;
case ReorderingMode::Failed:
llvm_unreachable("Not expected Failed reordering mode.");
@@ -1999,6 +2029,8 @@ public:
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
bool OpAPO = getData(OpIdx, Lane).APO;
+ bool IsInvariant = L && L->isLoopInvariant(Op);
+ unsigned Cnt = 0;
for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
if (Ln == Lane)
continue;
@@ -2008,22 +2040,37 @@ public:
OperandData &Data = getData(OpI, Ln);
if (Data.APO != OpAPO || Data.IsUsed)
continue;
- if (Data.V == Op) {
+ Value *OpILane = getValue(OpI, Lane);
+ bool IsConstantOp = isa<Constant>(OpILane);
+ if (Data.V == Op ||
+ (!IsConstantOp &&
+ ((Lns > 2 && isa<Constant>(Data.V)) ||
+ (Lns == 2 &&
+ !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
+ .getOpcode() &&
+ isa<Constant>(Data.V)))) ||
+ (IsInvariant && !isa<Constant>(Data.V) &&
+ !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
+ L->isLoopInvariant(Data.V))) {
FoundCandidate = true;
- Data.IsUsed = true;
+ Data.IsUsed = Data.V == Op;
+ if (Data.V == Op)
+ ++Cnt;
break;
}
}
if (!FoundCandidate)
return false;
}
- return true;
+ return getNumLanes() == 2 || Cnt > 1;
}
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
- : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
+ : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
+ L(R.LI->getLoopFor(
+ (cast<Instruction>(RootVL.front())->getParent()))) {
// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);
}
@@ -2155,8 +2202,6 @@ public:
// getBestOperand().
swap(OpIdx, *BestIdx, Lane);
} else {
- // We failed to find a best operand, set mode to 'Failed'.
- ReorderingModes[OpIdx] = ReorderingMode::Failed;
// Enable the second pass.
StrategyFailed = true;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index d87bdfe26899..aa9a070a7945 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -37,10 +37,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP101]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP16]], [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
@@ -64,15 +64,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]]
; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], [[TMP31]]
-; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
-; CHECK-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP40]], [[TMP39]]
-; CHECK-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP39]], [[TMP40]]
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
-; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
-; CHECK-NEXT: [[CONV:%.*]] = add i32 [[TMP42]], [[TMP41]]
-; CHECK-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP41]], [[TMP42]]
-; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]]
+; CHECK-NEXT: [[ADD44_2:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
+; CHECK-NEXT: [[CONV:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
+; CHECK-NEXT: [[ADD44_3:%.*]] = add i32 [[CONV]], [[ADD44_2]]
+; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[CONV]]
+; CHECK-NEXT: [[SUB45_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
+; CHECK-NEXT: [[SUB47_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
+; CHECK-NEXT: [[ADD46_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
+; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]]
+; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_3]]
; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
@@ -104,10 +104,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]]
-; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
-; CHECK-NEXT: [[TMP73:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
-; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
+; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
+; CHECK-NEXT: [[TMP190:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]]
; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
@@ -115,19 +115,19 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP79]], 15
; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
+; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[ADD46_2]], 15
; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
-; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP107]], 15
-; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
-; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
+; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[TMP107]], 15
+; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
+; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15
; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[CONV1]], 15
-; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
-; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
+; CHECK-NEXT: [[SHR_I49_6:%.*]] = lshr i32 [[CONV1]], 15
+; CHECK-NEXT: [[AND_I50_6:%.*]] = and i32 [[SHR_I49_6]], 65537
+; CHECK-NEXT: [[MUL_I51_6:%.*]] = mul i32 [[AND_I50_6]], 65535
; CHECK-NEXT: [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1
@@ -151,21 +151,21 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP97]], [[TMP90]]
; CHECK-NEXT: [[TMP104:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]]
; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT: [[TMP103:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
-; CHECK-NEXT: [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP103]]
+; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
+; CHECK-NEXT: [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP101]]
; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP165:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
+; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
; CHECK-NEXT: [[TMP105:%.*]] = sub <2 x i32> [[TMP200]], [[TMP104]]
-; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
-; CHECK-NEXT: [[TMP143:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
-; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP143]], [[TMP238]]
-; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
-; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP143]], 15
-; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
-; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
+; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP108]], [[TMP238]]
+; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15
; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
+; CHECK-NEXT: [[SHR_I59_4:%.*]] = lshr i32 [[TMP142]], 15
+; CHECK-NEXT: [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537
+; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535
; CHECK-NEXT: [[TMP109:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i8> poison, i8 [[TMP12]], i32 0
@@ -185,7 +185,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP126:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 1, i64 3>
; CHECK-NEXT: [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP126]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP153:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT: [[TMP144:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
; CHECK-NEXT: [[TMP129:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 5, i64 7>
; CHECK-NEXT: [[TMP130:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP129]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
@@ -195,15 +195,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP131]], [[TMP134]]
; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP153]]
+; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP144]]
; CHECK-NEXT: [[TMP139:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]]
; CHECK-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP113]]
-; CHECK-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT: [[TMP257:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]]
-; CHECK-NEXT: [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP139]]
-; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP257]], i32 0
-; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP257]], i32 1
+; CHECK-NEXT: [[TMP155:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
+; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP139]], [[TMP155]]
+; CHECK-NEXT: [[TMP189:%.*]] = sub <2 x i32> [[TMP155]], [[TMP139]]
+; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
+; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]]
; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP146]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
@@ -220,37 +220,37 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_2]], [[ADD103]]
; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP79]]
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51_3]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]]
+; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]]
-; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
-; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP143]]
+; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]]
+; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP108]]
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
; CHECK-NEXT: [[TMP150:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB47_2]], i32 1
-; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_2]], i32 1
-; CHECK-NEXT: [[TMP163:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
-; CHECK-NEXT: [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP155]]
-; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
+; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB59_2]], i32 1
+; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB51_2]], i32 1
+; CHECK-NEXT: [[TMP153:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
+; CHECK-NEXT: [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP189]], <2 x i32> [[TMP190]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP189]], <2 x i32> [[TMP190]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP184]]
+; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP153]], i32 1
; CHECK-NEXT: [[TMP158:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1
-; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP158]], [[TMP157]]
-; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
+; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP153]], i32 0
; CHECK-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0
-; CHECK-NEXT: [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP161]], [[TMP160]]
-; CHECK-NEXT: [[TMP164:%.*]] = sub <2 x i32> [[TMP163]], [[TMP156]]
-; CHECK-NEXT: [[TMP173:%.*]] = extractelement <2 x i32> [[TMP164]], i32 0
-; CHECK-NEXT: [[TMP174:%.*]] = extractelement <2 x i32> [[TMP164]], i32 1
-; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[TMP174]], [[TMP173]]
-; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[TMP173]], [[TMP174]]
-; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
+; CHECK-NEXT: [[TMP163:%.*]] = sub <2 x i32> [[TMP153]], [[TMP156]]
+; CHECK-NEXT: [[TMP164:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
+; CHECK-NEXT: [[TMP165:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
+; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[TMP165]], [[TMP164]]
+; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[TMP164]], [[TMP165]]
+; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_1]]
; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP107]]
-; CHECK-NEXT: [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP189]], <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[TMP167:%.*]] = lshr <2 x i32> [[TMP166]], <i32 15, i32 15>
; CHECK-NEXT: [[TMP168:%.*]] = and <2 x i32> [[TMP167]], <i32 65537, i32 65537>
; CHECK-NEXT: [[TMP169:%.*]] = mul <2 x i32> [[TMP168]], <i32 65535, i32 65535>
@@ -263,44 +263,44 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP283:%.*]] = shufflevector <2 x i32> [[TMP282]], <2 x i32> [[TMP211]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP177:%.*]] = add <2 x i32> [[TMP169]], [[TMP283]]
; CHECK-NEXT: [[TMP178:%.*]] = xor <2 x i32> [[TMP177]], [[TMP166]]
-; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
-; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP108]]
+; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]]
+; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP142]]
; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP178]], i32 0
; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP179]]
; CHECK-NEXT: [[TMP180:%.*]] = extractelement <2 x i32> [[TMP178]], i32 1
; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP180]]
; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP165]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_2]], i32 0
-; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> [[TMP165]], i32 [[CONV]], i32 0
-; CHECK-NEXT: [[TMP184:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]]
-; CHECK-NEXT: [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_3]], i32 0
+; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> [[TMP106]], i32 [[ADD46_2]], i32 0
+; CHECK-NEXT: [[TMP195:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]]
+; CHECK-NEXT: [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP187:%.*]] = sub <2 x i32> [[TMP185]], [[TMP186]]
-; CHECK-NEXT: [[TMP188:%.*]] = extractelement <2 x i32> [[TMP184]], i32 0
-; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0
-; CHECK-NEXT: [[TMP190:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP189]], [[TMP188]]
-; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP184]], i32 1
+; CHECK-NEXT: [[TMP188:%.*]] = extractelement <2 x i32> [[TMP195]], i32 0
+; CHECK-NEXT: [[TMP196:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0
+; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP195]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP196]], [[TMP188]]
+; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP195]], i32 1
; CHECK-NEXT: [[TMP192:%.*]] = extractelement <2 x i32> [[TMP187]], i32 1
-; CHECK-NEXT: [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP195]], <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP192]], [[TMP191]]
-; CHECK-NEXT: [[TMP194:%.*]] = sub <2 x i32> [[TMP184]], [[TMP187]]
+; CHECK-NEXT: [[TMP194:%.*]] = sub <2 x i32> [[TMP195]], [[TMP187]]
; CHECK-NEXT: [[TMP244:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
; CHECK-NEXT: [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP244]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP246:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]]
-; CHECK-NEXT: [[TMP247:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]]
-; CHECK-NEXT: [[TMP248:%.*]] = shufflevector <2 x i32> [[TMP246]], <2 x i32> [[TMP247]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP216:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]]
+; CHECK-NEXT: [[TMP210:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]]
+; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP216]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP215:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
; CHECK-NEXT: [[TMP203:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1
; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP215]], [[TMP203]]
; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP203]], [[TMP215]]
; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_2]]
; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT: [[TMP266:%.*]] = add <2 x i32> [[TMP149]], [[TMP248]]
+; CHECK-NEXT: [[TMP266:%.*]] = add <2 x i32> [[TMP149]], [[TMP221]]
; CHECK-NEXT: [[TMP267:%.*]] = xor <2 x i32> [[TMP266]], [[TMP110]]
; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP238]], 15
; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
@@ -313,48 +313,48 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP207:%.*]] = extractelement <2 x i32> [[TMP267]], i32 1
; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP207]]
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[TMP221:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB45_2]], i32 0
-; CHECK-NEXT: [[TMP222:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB47_2]], i32 0
-; CHECK-NEXT: [[TMP210:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]]
-; CHECK-NEXT: [[TMP225:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP226:%.*]] = sub <2 x i32> [[TMP225]], [[TMP212]]
-; CHECK-NEXT: [[TMP214:%.*]] = extractelement <2 x i32> [[TMP210]], i32 0
-; CHECK-NEXT: [[TMP227:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
-; CHECK-NEXT: [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP227]], [[TMP214]]
-; CHECK-NEXT: [[TMP217:%.*]] = extractelement <2 x i32> [[TMP210]], i32 1
-; CHECK-NEXT: [[TMP218:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
-; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[SUB59:%.*]] = add i32 [[TMP218]], [[TMP217]]
-; CHECK-NEXT: [[TMP220:%.*]] = sub <2 x i32> [[TMP210]], [[TMP226]]
-; CHECK-NEXT: [[TMP274:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
-; CHECK-NEXT: [[TMP275:%.*]] = shufflevector <2 x i32> [[TMP274]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP222:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB51_2]], i32 0
+; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB59_2]], i32 0
+; CHECK-NEXT: [[TMP226:%.*]] = sub <2 x i32> [[TMP222]], [[TMP225]]
+; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP190]], <2 x i32> [[TMP189]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP190]], <2 x i32> [[TMP189]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP213:%.*]] = sub <2 x i32> [[TMP227]], [[TMP212]]
+; CHECK-NEXT: [[TMP214:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
+; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
+; CHECK-NEXT: [[TMP239:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[TMP237]], [[TMP214]]
+; CHECK-NEXT: [[TMP217:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
+; CHECK-NEXT: [[TMP218:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
+; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP226]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP218]], [[TMP217]]
+; CHECK-NEXT: [[TMP240:%.*]] = sub <2 x i32> [[TMP226]], [[TMP213]]
; CHECK-NEXT: [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
; CHECK-NEXT: [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP276:%.*]] = add <2 x i32> [[TMP275]], [[TMP224]]
-; CHECK-NEXT: [[TMP277:%.*]] = sub <2 x i32> [[TMP275]], [[TMP224]]
-; CHECK-NEXT: [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP276]], <2 x i32> [[TMP277]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP228:%.*]] = extractelement <2 x i32> [[TMP220]], i32 0
-; CHECK-NEXT: [[TMP229:%.*]] = extractelement <2 x i32> [[TMP220]], i32 1
+; CHECK-NEXT: [[TMP241:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_5]], i32 0
+; CHECK-NEXT: [[TMP242:%.*]] = shufflevector <2 x i32> [[TMP241]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP261:%.*]] = add <2 x i32> [[TMP224]], [[TMP242]]
+; CHECK-NEXT: [[TMP262:%.*]] = sub <2 x i32> [[TMP224]], [[TMP242]]
+; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP228:%.*]] = extractelement <2 x i32> [[TMP240]], i32 0
+; CHECK-NEXT: [[TMP229:%.*]] = extractelement <2 x i32> [[TMP240]], i32 1
; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[TMP228]], [[TMP229]]
; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[TMP229]], [[TMP228]]
-; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_3]]
+; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_6]], [[ADD105_3]]
; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]]
; CHECK-NEXT: [[TMP230:%.*]] = lshr <2 x i32> [[TMP102]], <i32 15, i32 15>
; CHECK-NEXT: [[TMP231:%.*]] = and <2 x i32> [[TMP230]], <i32 65537, i32 65537>
; CHECK-NEXT: [[TMP232:%.*]] = mul <2 x i32> [[TMP231]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP286:%.*]] = add <2 x i32> [[TMP232]], [[TMP278]]
-; CHECK-NEXT: [[TMP287:%.*]] = xor <2 x i32> [[TMP286]], [[TMP102]]
+; CHECK-NEXT: [[TMP233:%.*]] = add <2 x i32> [[TMP232]], [[TMP220]]
+; CHECK-NEXT: [[TMP234:%.*]] = xor <2 x i32> [[TMP233]], [[TMP102]]
; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT: [[TMP235:%.*]] = extractelement <2 x i32> [[TMP287]], i32 0
+; CHECK-NEXT: [[TMP235:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP235]]
-; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP287]], i32 1
+; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP236]]
; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
; CHECK-NEXT: ret i32 [[ADD113_3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
index 94534274cab2..5f8941e9f889 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -332,18 +332,14 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re
define void @vec_shuff_reorder() #0 {
; CHECK-LABEL: @vec_shuff_reorder(
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @fb, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @fa, align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 1), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fa, align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr @fb, align 4
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
index 610cc5bdeb31..536526a5cfe0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
@@ -6,11 +6,11 @@ define i1 @test() {
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 0, 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i64> [[TMP1]], i64 0, i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> [[TMP2]], i64 0, i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> <i32 3, i32 poison, i32 1, i32 0>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> <i64 undef, i64 0, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <8 x i64> [[TMP3]], <8 x i32> <i32 11, i32 11, i32 11, i32 1, i32 9, i32 9, i32 1, i32 8>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 0, i64 0, i64 0, i64 0>, i64 0, i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> <i64 undef, i64 undef, i64 0, i64 0>, i64 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP11]], i64 0, i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 1, i32 3, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = sub <8 x i64> [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 5, i32 6, i32 7>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
index cac0491d0b64..7ae6793fba4c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
@@ -8,12 +8,10 @@ define i1 @test(float %0, double %1) {
; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> <double poison, double 0.000000e+00, double poison, double 0.000000e+00>, <4 x i32> <i32 poison, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 1>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double poison, double 0.000000e+00, double poison, double poison>, <4 x i32> <i32 poison, i32 5, i32 2, i32 3>
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 1, i32 2, i32 0, i32 7>
; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index dd7ba71ed673..f1580599ba12 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -12,8 +12,8 @@ define i64 @foo(i32 %tmp7) {
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 2, i32 3, i32 5, i32 poison, i32 6>
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 6
; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
+; CHECK-NEXT: [[TMP77:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP77]], <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index f665dac3282b..24b95c4e6ff2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -7,17 +7,14 @@ define void @foo(double %i) {
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00>, double [[I]], i32 2
; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> <double 0.000000e+00, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x i32> <i32 8, i32 poison, i32 2, i32 poison, i32 poison, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> [[TMP7]], double [[TMP2]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 1>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 5, i32 poison, i32 7>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP5]], i32 6
-; CHECK-NEXT: [[TMP12:%.*]] = fmul <8 x double> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 1, i32 poison, i32 0, i32 poison, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> <double poison, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 9, i32 poison, i32 3, i32 12, i32 5, i32 poison, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[TMP5]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP12:%.*]] = fmul <8 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, [[TMP8]]
; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> zeroinitializer, [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[TMP13]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ult <8 x double> [[TMP14]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index 593aad82ad5d..8562e53b1538 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -185,9 +185,13 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
+; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
+; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -233,13 +237,9 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
-; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
-; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll b/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll
index 681d131c5072..488ca0b23cd9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll
@@ -10,7 +10,7 @@ define void @foo() {
; CHECK-NEXT: br label [[BCI_252:%.*]]
; CHECK: bci_252:
; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[BCI_0:%.*]] ], [ [[TMP16:%.*]], [[BCI_252_1:%.*]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i32> [[TMP6]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
index 19a3a7d53df0..9df7aa1c727c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
@@ -5,8 +5,8 @@ define void @test(i32 %0, ptr %p) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: i32 [[TMP0:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 1, i32 0, i32 poison>, i32 [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 0>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[PH:%.*]]
; CHECK: ph:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
index 69ecf1852aed..8f1d7a11e150 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -7,11 +7,9 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4
; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
-; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4
-; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
-; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4
; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -21,10 +19,11 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
+; CHECK-NEXT: [[T9:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[T15:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
-; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270
; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137
; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -34,20 +33,19 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
-; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T27]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T47]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> <i32 1, i32 0, i32 6, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T40]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 poison, i32 poison, i32 3>
+; CHECK-NEXT: [[T701:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T50]], i32 5
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
-; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
-; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4
; CHECK-NEXT: ret void
;