llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1771,6 +1771,10 @@ public:
/// scalable version of the vectorized loop. /// scalable version of the vectorized loop.
bool preferFixedOverScalableIfEqualCost() const; bool preferFixedOverScalableIfEqualCost() const;
+ /// \returns True if target prefers SLP vectorizer with alternate opcode
+ /// vectorization, false otherwise.
+ bool preferAlternateOpcodeVectorization() const;
+
/// \returns True if the target prefers reductions in loop. /// \returns True if the target prefers reductions in loop.
bool preferInLoopReduction(unsigned Opcode, Type *Ty) const; bool preferInLoopReduction(unsigned Opcode, Type *Ty) const;
@@ -2325,6 +2329,7 @@ public:
virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty) const = 0; virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty) const = 0;
virtual bool preferPredicatedReductionSelect(unsigned Opcode, virtual bool preferPredicatedReductionSelect(unsigned Opcode,
Type *Ty) const = 0; Type *Ty) const = 0;
+ virtual bool preferAlternateOpcodeVectorization() const = 0;
virtual bool preferEpilogueVectorization() const = 0; virtual bool preferEpilogueVectorization() const = 0;
virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
@@ -3135,6 +3140,9 @@ public:
bool preferInLoopReduction(unsigned Opcode, Type *Ty) const override { bool preferInLoopReduction(unsigned Opcode, Type *Ty) const override {
return Impl.preferInLoopReduction(Opcode, Ty); return Impl.preferInLoopReduction(Opcode, Ty);
} }
+ bool preferAlternateOpcodeVectorization() const override {
+ return Impl.preferAlternateOpcodeVectorization();
+ }
bool preferPredicatedReductionSelect(unsigned Opcode, bool preferPredicatedReductionSelect(unsigned Opcode,
Type *Ty) const override { Type *Ty) const override {
return Impl.preferPredicatedReductionSelect(Opcode, Ty); return Impl.preferPredicatedReductionSelect(Opcode, Ty);
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1007,6 +1007,7 @@ public:
bool preferFixedOverScalableIfEqualCost() const { return false; } bool preferFixedOverScalableIfEqualCost() const { return false; }
bool preferInLoopReduction(unsigned Opcode, Type *Ty) const { return false; } bool preferInLoopReduction(unsigned Opcode, Type *Ty) const { return false; }
+ bool preferAlternateOpcodeVectorization() const { return true; }
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty) const { bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty) const {
return false; return false;
llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1384,6 +1384,10 @@ bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode,
return TTIImpl->preferInLoopReduction(Opcode, Ty); return TTIImpl->preferInLoopReduction(Opcode, Ty);
} }
+bool TargetTransformInfo::preferAlternateOpcodeVectorization() const {
+ return TTIImpl->preferAlternateOpcodeVectorization();
+}
+
bool TargetTransformInfo::preferPredicatedReductionSelect(unsigned Opcode, bool TargetTransformInfo::preferPredicatedReductionSelect(unsigned Opcode,
Type *Ty) const { Type *Ty) const {
return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty); return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty);
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -125,6 +125,8 @@ public:
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
+ bool preferAlternateOpcodeVectorization() const { return false; }
+
bool preferEpilogueVectorization() const { bool preferEpilogueVectorization() const {
// Epilogue vectorization is usually unprofitable - tail folding or // Epilogue vectorization is usually unprofitable - tail folding or
// a smaller VF would have been better. This a blunt hammer - we // a smaller VF would have been better. This a blunt hammer - we
llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -292,6 +292,7 @@ public:
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const; bool IsZeroCmp) const;
+ bool preferAlternateOpcodeVectorization() const { return false; }
bool prefersVectorizedAddressing() const; bool prefersVectorizedAddressing() const;
bool supportsEfficientVectorElementLoadStore() const; bool supportsEfficientVectorElementLoadStore() const;
bool enableInterleavedAccessVectorization(); bool enableInterleavedAccessVectorization();
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -141,6 +141,10 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
cl::desc( cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store")); "Attempt to vectorize horizontal reductions feeding into a store"));
+static cl::opt<bool> SplitAlternateInstructions(
+ "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
+ cl::desc("Improve the code quality by splitting alternate instructions"));
+
static cl::opt<int> static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits")); cl::desc("Attempt to vectorize for this register size in bits"));
@@ -840,6 +844,35 @@ public:
return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
} }
+ /// Checks if main/alt instructions are shift operations.
+ bool isShiftOp() const {
+ return getMainOp()->isShift() && getAltOp()->isShift();
+ }
+
+ /// Checks if main/alt instructions are bitwise logic operations.
+ bool isBitwiseLogicOp() const {
+ return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
+ }
+
+ /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
+ bool isMulDivLikeOp() const {
+ constexpr std::array<unsigned, 8> MulDiv = {
+ Instruction::Mul, Instruction::FMul, Instruction::SDiv,
+ Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
+ Instruction::URem, Instruction::FRem};
+ return is_contained(MulDiv, getOpcode()) &&
+ is_contained(MulDiv, getAltOpcode());
+ }
+
+ /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
+ bool isAddSubLikeOp() const {
+ constexpr std::array<unsigned, 4> AddSub = {
+ Instruction::Add, Instruction::Sub, Instruction::FAdd,
+ Instruction::FSub};
+ return is_contained(AddSub, getOpcode()) &&
+ is_contained(AddSub, getAltOpcode());
+ }
+
/// Checks if the current state is valid, i.e. has non-null MainOp /// Checks if the current state is valid, i.e. has non-null MainOp
bool valid() const { return MainOp && AltOp; } bool valid() const { return MainOp && AltOp; }
@@ -1472,6 +1505,7 @@ public:
void deleteTree() { void deleteTree() {
VectorizableTree.clear(); VectorizableTree.clear();
ScalarToTreeEntries.clear(); ScalarToTreeEntries.clear();
+ ScalarsInSplitNodes.clear();
MustGather.clear(); MustGather.clear();
NonScheduledFirst.clear(); NonScheduledFirst.clear();
EntryToLastInstruction.clear(); EntryToLastInstruction.clear();
@@ -1507,7 +1541,7 @@ public:
/// should be represented as an empty order, so this is used to /// should be represented as an empty order, so this is used to
/// decide if we can canonicalize a computed order. Undef elements /// decide if we can canonicalize a computed order. Undef elements
/// (represented as size) are ignored. /// (represented as size) are ignored.
- bool isIdentityOrder(ArrayRef<unsigned> Order) const {
+ static bool isIdentityOrder(ArrayRef<unsigned> Order) {
assert(!Order.empty() && "expected non-empty order"); assert(!Order.empty() && "expected non-empty order");
const unsigned Sz = Order.size(); const unsigned Sz = Order.size();
return all_of(enumerate(Order), [&](const auto &P) { return all_of(enumerate(Order), [&](const auto &P) {
@@ -3229,12 +3263,35 @@ private:
/// \returns Common mask for reorder indices and reused scalars. /// \returns Common mask for reorder indices and reused scalars.
SmallVector<int> getCommonMask() const { SmallVector<int> getCommonMask() const {
+ if (State == TreeEntry::SplitVectorize)
+ return {};
SmallVector<int> Mask; SmallVector<int> Mask;
inversePermutation(ReorderIndices, Mask); inversePermutation(ReorderIndices, Mask);
::addMask(Mask, ReuseShuffleIndices); ::addMask(Mask, ReuseShuffleIndices);
return Mask; return Mask;
} }
+ /// \returns The mask for split nodes.
+ SmallVector<int> getSplitMask() const {
+ assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
+ "Expected only split vectorize node.");
+ SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
+ unsigned CommonVF = std::max<unsigned>(
+ CombinedEntriesWithIndices.back().second,
+ Scalars.size() - CombinedEntriesWithIndices.back().second);
+ for (auto [Idx, I] : enumerate(ReorderIndices))
+ Mask[I] =
+ Idx + (Idx >= CombinedEntriesWithIndices.back().second
+ ? CommonVF - CombinedEntriesWithIndices.back().second
+ : 0);
+ return Mask;
+ }
+
+ /// Updates (reorders) SplitVectorize node according to the given mask \p
+ /// Mask and order \p MaskOrder.
+ void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
+ ArrayRef<int> MaskOrder);
+
/// \returns true if the scalars in VL are equal to this entry. /// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const { bool isSame(ArrayRef<Value *> VL) const {
auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) { auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
@@ -3322,6 +3379,8 @@ private:
///< complex node like select/cmp to minmax, mul/add to ///< complex node like select/cmp to minmax, mul/add to
///< fma, etc. Must be used for the following nodes in ///< fma, etc. Must be used for the following nodes in
///< the pattern, not the very first one. ///< the pattern, not the very first one.
+ SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
+ ///< independently and then combines back.
}; };
EntryState State; EntryState State;
@@ -3352,7 +3411,7 @@ private:
/// The index of this treeEntry in VectorizableTree. /// The index of this treeEntry in VectorizableTree.
unsigned Idx = 0; unsigned Idx = 0;
- /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
+ /// For gather/buildvector/alt opcode nodes, which are combined from
/// other nodes as a series of insertvector instructions. /// other nodes as a series of insertvector instructions.
SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices; SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
@@ -3547,6 +3606,9 @@ private:
case CombinedVectorize: case CombinedVectorize:
dbgs() << "CombinedVectorize\n"; dbgs() << "CombinedVectorize\n";
break; break;
+ case SplitVectorize:
+ dbgs() << "SplitVectorize\n";
+ break;
} }
if (S) { if (S) {
dbgs() << "MainOp: " << *S.getMainOp() << "\n"; dbgs() << "MainOp: " << *S.getMainOp() << "\n";
@@ -3627,8 +3689,10 @@ private:
const EdgeInfo &UserTreeIdx, const EdgeInfo &UserTreeIdx,
ArrayRef<int> ReuseShuffleIndices = {}, ArrayRef<int> ReuseShuffleIndices = {},
ArrayRef<unsigned> ReorderIndices = {}) { ArrayRef<unsigned> ReorderIndices = {}) {
- assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||+ assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
- (Bundle && EntryState != TreeEntry::NeedToGather)) &&+ EntryState == TreeEntry::SplitVectorize)) ||
+ (Bundle && EntryState != TreeEntry::NeedToGather &&
+ EntryState != TreeEntry::SplitVectorize)) &&
"Need to vectorize gather entry?"); "Need to vectorize gather entry?");
// Gathered loads still gathered? Do not create entry, use the original one. // Gathered loads still gathered? Do not create entry, use the original one.
if (GatheredLoadsEntriesFirst.has_value() && if (GatheredLoadsEntriesFirst.has_value() &&
@@ -3666,7 +3730,33 @@ private:
Last->setOperations(S); Last->setOperations(S);
Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
} }
- if (!Last->isGather()) {+ if (EntryState == TreeEntry::SplitVectorize) {
+ auto *MainOp =
+ cast<Instruction>(*find_if(Last->Scalars, IsaPred<Instruction>));
+ auto *AltOp = cast<Instruction>(*find_if(Last->Scalars, [=](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+ InstructionsState LocalS = getSameOpcode({I, MainOp}, *TLI);
+ return !LocalS || LocalS.isAltShuffle();
+ }));
+ Last->setOperations(InstructionsState(MainOp, AltOp));
+ SmallPtrSet<Value *, 4> Processed;
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ auto It = ScalarsInSplitNodes.find(V);
+ if (It == ScalarsInSplitNodes.end()) {
+ ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
+ (void)Processed.insert(V);
+ } else if (Processed.insert(V).second) {
+ assert(!is_contained(It->getSecond(), Last) &&
+ "Value already associated with the node.");
+ It->getSecond().push_back(Last);
+ }
+ }
+ } else if (!Last->isGather()) {
SmallPtrSet<Value *, 4> Processed; SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) { for (Value *V : VL) {
if (isa<PoisonValue>(V)) if (isa<PoisonValue>(V))
@@ -3748,6 +3838,15 @@ private:
return It->getSecond(); return It->getSecond();
} }
+ /// Get list of split vector entries, associated with the value \p V.
+ ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
+ assert(V && "V cannot be nullptr.");
+ auto It = ScalarsInSplitNodes.find(V);
+ if (It == ScalarsInSplitNodes.end())
+ return {};
+ return It->getSecond();
+ }
+
/// Returns first vector node for value \p V, matching values \p VL. /// Returns first vector node for value \p V, matching values \p VL.
TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL, TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
bool SameVF = false) const { bool SameVF = false) const {
@@ -3778,6 +3877,9 @@ private:
/// Maps a specific scalar to its tree entry(ies). /// Maps a specific scalar to its tree entry(ies).
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries; SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
+ /// Scalars, used in split vectorize nodes.
+ SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
+
/// Maps a value to the proposed vectorizable size. /// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize; SmallDenseMap<Value *, unsigned> InstrElementSize;
@@ -5764,12 +5866,14 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
!Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) && !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
(TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices))) (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
return std::nullopt; return std::nullopt;
- if ((TE.State == TreeEntry::Vectorize ||+ if (TE.State == TreeEntry::SplitVectorize ||
- TE.State == TreeEntry::StridedVectorize) &&+ ((TE.State == TreeEntry::Vectorize ||
- (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||+ TE.State == TreeEntry::StridedVectorize) &&
- (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {+ (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
- assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "+ (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
- "BinaryOperator and CastInst.");+ assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
+ "Alternate instructions are only supported by "
+ "BinaryOperator and CastInst.");
return TE.ReorderIndices; return TE.ReorderIndices;
} }
if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
@@ -5880,7 +5984,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
return std::nullopt; // No need to reorder. return std::nullopt; // No need to reorder.
return std::move(Phis); return std::move(Phis);
} }
- if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&+ if (TE.isGather() &&
+ (!TE.hasState() || !TE.isAltShuffle() ||
+ ScalarsInSplitNodes.contains(TE.getMainOp())) &&
allSameType(TE.Scalars)) { allSameType(TE.Scalars)) {
// TODO: add analysis of other gather nodes with extractelement // TODO: add analysis of other gather nodes with extractelement
// instructions and other values/instructions, not only undefs. // instructions and other values/instructions, not only undefs.
@@ -6088,6 +6194,30 @@ bool BoUpSLP::isProfitableToReorder() const {
return true; return true;
} }
+void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
+ ArrayRef<int> MaskOrder) {
+ assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
+ SmallVector<int> NewMask(getVectorFactor());
+ SmallVector<int> NewMaskOrder(getVectorFactor());
+ std::iota(NewMask.begin(), NewMask.end(), 0);
+ std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
+ if (Idx == 0) {
+ copy(Mask, NewMask.begin());
+ copy(MaskOrder, NewMaskOrder.begin());
+ } else {
+ assert(Idx == 1 && "Expected either 0 or 1 index.");
+ unsigned Offset = CombinedEntriesWithIndices.back().second;
+ for (unsigned I : seq<unsigned>(Mask.size())) {
+ NewMask[I + Offset] = Mask[I] + Offset;
+ NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
+ }
+ }
+ reorderScalars(Scalars, NewMask);
+ reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
+ if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
+ ReorderIndices.clear();
+}
+
void BoUpSLP::reorderTopToBottom() { void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes. // Maps VF to the graph nodes.
DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries; DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -6122,7 +6252,8 @@ void BoUpSLP::reorderTopToBottom() {
// Patterns like [fadd,fsub] can be combined into a single instruction in // Patterns like [fadd,fsub] can be combined into a single instruction in
// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
// to take into account their order when looking for the most used order. // to take into account their order when looking for the most used order.
- if (TE->hasState() && TE->isAltShuffle()) {+ if (TE->hasState() && TE->isAltShuffle() &&
+ TE->State != TreeEntry::SplitVectorize) {
VectorType *VecTy = VectorType *VecTy =
getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size()); getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
unsigned Opcode0 = TE->getOpcode(); unsigned Opcode0 = TE->getOpcode();
@@ -6163,7 +6294,8 @@ void BoUpSLP::reorderTopToBottom() {
} }
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize || if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) ||+ TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::SplitVectorize) ||
!TE->ReuseShuffleIndices.empty()) !TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder); GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize && if (TE->State == TreeEntry::Vectorize &&
@@ -6194,7 +6326,8 @@ void BoUpSLP::reorderTopToBottom() {
for (const TreeEntry *OpTE : OrderedEntries) { for (const TreeEntry *OpTE : OrderedEntries) {
// No need to reorder this nodes, still need to extend and to use shuffle, // No need to reorder this nodes, still need to extend and to use shuffle,
// just need to merge reordering shuffle and the reuse shuffle. // just need to merge reordering shuffle and the reuse shuffle.
- if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))+ if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
+ OpTE->State != TreeEntry::SplitVectorize)
continue; continue;
// Count number of orders uses. // Count number of orders uses.
const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders, const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
@@ -6301,14 +6434,17 @@ void BoUpSLP::reorderTopToBottom() {
// Just do the reordering for the nodes with the given VF. // Just do the reordering for the nodes with the given VF.
if (TE->Scalars.size() != VF) { if (TE->Scalars.size() != VF) {
if (TE->ReuseShuffleIndices.size() == VF) { if (TE->ReuseShuffleIndices.size() == VF) {
+ assert(TE->State != TreeEntry::SplitVectorize &&
+ "Split vectorized not expected.");
// Need to reorder the reuses masks of the operands with smaller VF to // Need to reorder the reuses masks of the operands with smaller VF to
// be able to find the match between the graph nodes and scalar // be able to find the match between the graph nodes and scalar
// operands of the given node during vectorization/cost estimation. // operands of the given node during vectorization/cost estimation.
- assert((!TE->UserTreeIndex ||+ assert(
- TE->UserTreeIndex.UserTE->Scalars.size() == VF ||+ (!TE->UserTreeIndex ||
- TE->UserTreeIndex.UserTE->Scalars.size() ==+ TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
- TE->Scalars.size()) &&+ TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
- "All users must be of VF size.");+ TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
+ "All users must be of VF size.");
if (SLPReVec) { if (SLPReVec) {
assert(SLPReVec && "Only supported by REVEC."); assert(SLPReVec && "Only supported by REVEC.");
// ShuffleVectorInst does not do reorderOperands (and it should not // ShuffleVectorInst does not do reorderOperands (and it should not
@@ -6325,19 +6461,28 @@ void BoUpSLP::reorderTopToBottom() {
// Update ordering of the operands with the smaller VF than the given // Update ordering of the operands with the smaller VF than the given
// one. // one.
reorderNodeWithReuses(*TE, Mask); reorderNodeWithReuses(*TE, Mask);
+ // Update orders in user split vectorize nodes.
+ if (TE->UserTreeIndex &&
+ TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
+ TE->UserTreeIndex.UserTE->reorderSplitNode(
+ TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
} }
continue; continue;
} }
- if ((TE->State == TreeEntry::Vectorize ||+ if ((TE->State == TreeEntry::SplitVectorize &&
- TE->State == TreeEntry::StridedVectorize) &&+ TE->ReuseShuffleIndices.empty()) ||
- (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,+ ((TE->State == TreeEntry::Vectorize ||
- InsertElementInst>(TE->getMainOp()) ||+ TE->State == TreeEntry::StridedVectorize) &&
- (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {+ (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
- assert(!TE->isAltShuffle() &&+ InsertElementInst>(TE->getMainOp()) ||
- "Alternate instructions are only supported by BinaryOperator "+ (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
- "and CastInst.");+ assert(
- // Build correct orders for extract{element,value}, loads and+ (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
- // stores.+ TE->ReuseShuffleIndices.empty())) &&
+ "Alternate instructions are only supported by BinaryOperator "
+ "and CastInst.");
+ // Build correct orders for extract{element,value}, loads,
+ // stores and alternate (split) nodes.
reorderOrder(TE->ReorderIndices, Mask); reorderOrder(TE->ReorderIndices, Mask);
if (isa<InsertElementInst, StoreInst>(TE->getMainOp())) if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
TE->reorderOperands(Mask); TE->reorderOperands(Mask);
@@ -6357,7 +6502,11 @@ void BoUpSLP::reorderTopToBottom() {
inversePermutation(CurrentOrder, NewReuses); inversePermutation(CurrentOrder, NewReuses);
addMask(NewReuses, TE->ReuseShuffleIndices); addMask(NewReuses, TE->ReuseShuffleIndices);
TE->ReuseShuffleIndices.swap(NewReuses); TE->ReuseShuffleIndices.swap(NewReuses);
- }+ } else if (TE->UserTreeIndex &&
+ TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
+ // Update orders in user split vectorize nodes.
+ TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
+ Mask, MaskOrder);
} }
} }
} }
@@ -6370,7 +6519,8 @@ bool BoUpSLP::canReorderOperands(
if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) { if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
return OpData.first == I && return OpData.first == I &&
(OpData.second->State == TreeEntry::Vectorize || (OpData.second->State == TreeEntry::Vectorize ||
- OpData.second->State == TreeEntry::StridedVectorize);+ OpData.second->State == TreeEntry::StridedVectorize ||
+ OpData.second->State == TreeEntry::SplitVectorize);
})) }))
continue; continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
@@ -6384,6 +6534,7 @@ bool BoUpSLP::canReorderOperands(
// node, just reorder reuses mask. // node, just reorder reuses mask.
if (TE->State != TreeEntry::Vectorize && if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize && TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
GatherOps.push_back(TE); GatherOps.push_back(TE);
continue; continue;
@@ -6393,6 +6544,7 @@ bool BoUpSLP::canReorderOperands(
[&Gather, UserTE, I](TreeEntry *TE) { [&Gather, UserTE, I](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize && assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize && TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::SplitVectorize &&
"Only non-vectorized nodes are expected."); "Only non-vectorized nodes are expected.");
if (TE->UserTreeIndex.UserTE == UserTE && if (TE->UserTreeIndex.UserTE == UserTE &&
TE->UserTreeIndex.EdgeIdx == I) { TE->UserTreeIndex.EdgeIdx == I) {
@@ -6412,7 +6564,14 @@ bool BoUpSLP::canReorderOperands(
} }
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
- SetVector<TreeEntry *> OrderedEntries;+ struct TreeEntryCompare {
+ bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
+ if (LHS->UserTreeIndex && RHS->UserTreeIndex)
+ return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
+ return LHS->Idx < RHS->Idx;
+ }
+ };
+ PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
DenseSet<const TreeEntry *> GathersToOrders; DenseSet<const TreeEntry *> GathersToOrders;
// Find all reorderable leaf nodes with the given VF. // Find all reorderable leaf nodes with the given VF.
// Currently the are vectorized loads,extracts without alternate operands + // Currently the are vectorized loads,extracts without alternate operands +
@@ -6420,13 +6579,15 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> NonVectorized; SmallVector<TreeEntry *> NonVectorized;
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE->State != TreeEntry::Vectorize && if (TE->State != TreeEntry::Vectorize &&
- TE->State != TreeEntry::StridedVectorize)+ TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::SplitVectorize)
NonVectorized.push_back(TE.get()); NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder = if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) { getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
- OrderedEntries.insert(TE.get());+ Queue.push(TE.get());
if (!(TE->State == TreeEntry::Vectorize || if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) ||+ TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::SplitVectorize) ||
!TE->ReuseShuffleIndices.empty()) !TE->ReuseShuffleIndices.empty())
GathersToOrders.insert(TE.get()); GathersToOrders.insert(TE.get());
} }
@@ -6437,40 +6598,88 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// one operand order in the natural order and reorder others + reorder the // one operand order in the natural order and reorder others + reorder the
// user node itself. // user node itself.
SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps; SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
- while (!OrderedEntries.empty()) {+ while (!Queue.empty()) {
// 1. Filter out only reordered nodes. // 1. Filter out only reordered nodes.
- DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;+ std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
- SmallVector<TreeEntry *> Filtered;+ TreeEntry *TE = Queue.top();
- for (TreeEntry *TE : OrderedEntries) {+ const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
+ Queue.pop();
+ SmallVector<TreeEntry *> OrderedOps(1, TE);
+ while (!Queue.empty()) {
+ TE = Queue.top();
+ if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
+ break;
+ Queue.pop();
+ OrderedOps.push_back(TE);
+ }
+ for (TreeEntry *TE : OrderedOps) {
if (!(TE->State == TreeEntry::Vectorize || if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize || TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::SplitVectorize ||
(TE->isGather() && GathersToOrders.contains(TE))) || (TE->isGather() && GathersToOrders.contains(TE))) ||
!TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() || !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
- !Visited.insert(TE).second) {+ !Visited.insert(TE).second)
- Filtered.push_back(TE);
continue; continue;
- }
// Build a map between user nodes and their operands order to speedup // Build a map between user nodes and their operands order to speedup
// search. The graph currently does not provide this dependency directly. // search. The graph currently does not provide this dependency directly.
- Users[TE->UserTreeIndex.UserTE].emplace_back(TE->UserTreeIndex.EdgeIdx,+ Users.first = TE->UserTreeIndex.UserTE;
- TE);+ Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
- }+ }
- // Erase filtered entries.+ if (Users.first) {
- for (TreeEntry *TE : Filtered)+ auto &Data = Users;
- OrderedEntries.remove(TE);+ if (Data.first->State == TreeEntry::SplitVectorize) {
- SmallVector<+ assert(
- std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>+ Data.second.size() <= 2 &&
- UsersVec(Users.begin(), Users.end());+ "Expected not greater than 2 operands for split vectorize node.");
- sort(UsersVec, [](const auto &Data1, const auto &Data2) {+ if (any_of(Data.second,
- return Data1.first->Idx > Data2.first->Idx;+ [](const auto &Op) { return !Op.second->UserTreeIndex; }))
- });+ continue;
- for (auto &Data : UsersVec) {+ // Update orders in user split vectorize nodes.
+ assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
+ "Expected exactly 2 entries.");
+ for (const auto &P : Data.first->CombinedEntriesWithIndices) {
+ TreeEntry &OpTE = *VectorizableTree[P.first].get();
+ OrdersType Order = OpTE.ReorderIndices;
+ if (Order.empty()) {
+ if (!OpTE.isGather())
+ continue;
+ const auto BestOrder =
+ getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
+ if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
+ continue;
+ Order = *BestOrder;
+ }
+ fixupOrderingIndices(Order);
+ SmallVector<int> Mask;
+ inversePermutation(Order, Mask);
+ const unsigned E = Order.size();
+ SmallVector<int> MaskOrder(E, PoisonMaskElem);
+ transform(Order, MaskOrder.begin(), [E](unsigned I) {
+ return I < E ? static_cast<int>(I) : PoisonMaskElem;
+ });
+ Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
+ // Clear ordering of the operand.
+ if (!OpTE.ReorderIndices.empty()) {
+ OpTE.ReorderIndices.clear();
+ } else {
+ assert(OpTE.isGather() && "Expected only gather/buildvector node.");
+ reorderScalars(OpTE.Scalars, Mask);
+ }
+ }
+ if (Data.first->ReuseShuffleIndices.empty() &&
+ !Data.first->ReorderIndices.empty()) {
+ // Insert user node to the list to try to sink reordering deeper in
+ // the graph.
+ Queue.push(Data.first);
+ }
+ continue;
+ }
// Check that operands are used only in the User node. // Check that operands are used only in the User node.
SmallVector<TreeEntry *> GatherOps; SmallVector<TreeEntry *> GatherOps;
if (!canReorderOperands(Data.first, Data.second, NonVectorized, if (!canReorderOperands(Data.first, Data.second, NonVectorized,
GatherOps)) { GatherOps)) {
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
- OrderedEntries.remove(Op.second);+ Visited.insert(Op.second);
continue; continue;
} }
// All operands are reordered and used only in this node - propagate the // All operands are reordered and used only in this node - propagate the
@@ -6563,6 +6772,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
UTE->UserTreeIndex.UserTE == Data.first) || UTE->UserTreeIndex.UserTE == Data.first) ||
(Data.first->UserTreeIndex && (Data.first->UserTreeIndex &&
Data.first->UserTreeIndex.UserTE == UTE) || Data.first->UserTreeIndex.UserTE == UTE) ||
+ (IgnoreReorder && UTE->UserTreeIndex &&
+ UTE->UserTreeIndex.UserTE->Idx == 0) ||
NodeShouldBeReorderedWithOperands(UTE); NodeShouldBeReorderedWithOperands(UTE);
})) }))
continue; continue;
@@ -6576,7 +6787,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
continue; continue;
const TreeEntry *Op = getOperandEntry(UTE, Idx); const TreeEntry *Op = getOperandEntry(UTE, Idx);
Visited.erase(Op); Visited.erase(Op);
- OrderedEntries.insert(const_cast<TreeEntry *>(Op));+ Queue.push(const_cast<TreeEntry *>(Op));
} }
} }
} }
@@ -6633,7 +6844,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// the compile time. // the compile time.
// Profitable to reorder if definitely more operands allow // Profitable to reorder if definitely more operands allow
// reordering rather than those with natural order. // reordering rather than those with natural order.
- ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];+ ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.second;
if (static_cast<unsigned>(count_if( if (static_cast<unsigned>(count_if(
Ops, [UserTE, &AllowsReordering]( Ops, [UserTE, &AllowsReordering](
const std::pair<unsigned, TreeEntry *> &Op) { const std::pair<unsigned, TreeEntry *> &Op) {
@@ -6645,7 +6856,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
} }
if (OrdersUses.empty()) { if (OrdersUses.empty()) {
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
- OrderedEntries.remove(Op.second);+ Visited.insert(Op.second);
continue; continue;
} }
// Choose the most used order. // Choose the most used order.
@@ -6675,7 +6886,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Set order of the user node. // Set order of the user node.
if (isIdentityOrder(BestOrder)) { if (isIdentityOrder(BestOrder)) {
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
- OrderedEntries.remove(Op.second);+ Visited.insert(Op.second);
continue; continue;
} }
fixupOrderingIndices(BestOrder); fixupOrderingIndices(BestOrder);
@@ -6690,7 +6901,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}); });
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) { for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
TreeEntry *TE = Op.second; TreeEntry *TE = Op.second;
- OrderedEntries.remove(TE);
if (!VisitedOps.insert(TE).second) if (!VisitedOps.insert(TE).second)
continue; continue;
if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
@@ -6700,6 +6910,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Gathers are processed separately. // Gathers are processed separately.
if (TE->State != TreeEntry::Vectorize && if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize && TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::SplitVectorize &&
(TE->State != TreeEntry::ScatterVectorize || (TE->State != TreeEntry::ScatterVectorize ||
TE->ReorderIndices.empty())) TE->ReorderIndices.empty()))
continue; continue;
@@ -6720,7 +6931,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
continue; continue;
} }
reorderScalars(Gather->Scalars, Mask); reorderScalars(Gather->Scalars, Mask);
- OrderedEntries.remove(Gather);+ Visited.insert(Gather);
} }
// Reorder operands of the user node and set the ordering for the user // Reorder operands of the user node and set the ordering for the user
// node itself. // node itself.
@@ -6740,7 +6951,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
!Data.first->isAltShuffle()) { !Data.first->isAltShuffle()) {
// Insert user node to the list to try to sink reordering deeper in // Insert user node to the list to try to sink reordering deeper in
// the graph. // the graph.
- OrderedEntries.insert(Data.first);+ Queue.push(Data.first);
} }
} else { } else {
reorderOrder(Data.first->ReorderIndices, Mask); reorderOrder(Data.first->ReorderIndices, Mask);
@@ -6770,7 +6981,7 @@ void BoUpSLP::buildExternalUses(
TreeEntry *Entry = TEPtr.get(); TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values. // No need to handle users of gathered values.
- if (Entry->isGather())+ if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
continue; continue;
// For each lane: // For each lane:
@@ -8379,6 +8590,48 @@ public:
}; };
} // namespace } // namespace
+/// Returns main/alternate instructions for the given \p VL. Unlike
+/// getSameOpcode supports non-compatible instructions for better SplitVectorize
+/// node support.
+/// \returns first main/alt instructions, if only poisons and instruction with
+/// only 2 opcodes exists. Returns pair of nullptr otherwise.
+static std::pair<Instruction *, Instruction *>
+getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+ for (Value *V : VL) {
+ if (isa<PoisonValue>(V))
+ continue;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return {};
+ if (!MainOp) {
+ MainOp = I;
+ continue;
+ }
+ if (MainOp->getOpcode() == I->getOpcode()) {
+ if (I->getParent() != MainOp->getParent())
+ return {};
+ continue;
+ }
+ if (!AltOp) {
+ AltOp = I;
+ continue;
+ }
+ if (AltOp->getOpcode() == I->getOpcode()) {
+ if (I->getParent() != AltOp->getParent())
+ return {};
+ continue;
+ }
+ return {};
+ }
+ if (!AltOp)
+ return {};
+ assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
+ "Expected different main and alt instructions.");
+ return std::make_pair(MainOp, AltOp);
+}
+
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx, const EdgeInfo &UserTreeIdx,
unsigned InterleaveFactor) { unsigned InterleaveFactor) {
@@ -8529,6 +8782,146 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return; return;
} }
  // Tries to build a SplitVectorize node: the scalars of VL are partitioned
  // into two sub-lists (by compatibility with the main opcode) that are
  // vectorized separately and recombined with a shuffle. Returns true if a
  // tree entry (split node or gather fallback) was created for VL.
  constexpr unsigned SmallNodeSize = 4;
  auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize,
                                       const InstructionsState &LocalState) {
    // Splitting is not considered for tiny nodes, for targets that prefer
    // plain alternate-opcode vectorization, or when disabled by option.
    if (VL.size() <= SmallNodeSize ||
        TTI.preferAlternateOpcodeVectorization() || !SplitAlternateInstructions)
      return false;

    // Any value is used in split node already - just gather.
    if (any_of(VL, [&](Value *V) {
          return ScalarsInSplitNodes.contains(V) || isVectorized(V);
        })) {
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndices);
      return true;
    }
    // Partition VL: values compatible with the main opcode (and
    // non-instructions) go to Op1, the rest (alternate opcode) go to Op2.
    // Op1Indices remembers which original lanes went to Op1.
    SmallVector<Value *> Op1, Op2;
    OrdersType ReorderIndices(VL.size(), VL.size());
    SmallBitVector Op1Indices(VL.size());
    for (auto [Idx, V] : enumerate(VL)) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I) {
        Op1.push_back(V);
        Op1Indices.set(Idx);
        continue;
      }
      InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI);
      if (NewS && !NewS.isAltShuffle()) {
        Op1.push_back(V);
        Op1Indices.set(Idx);
        continue;
      }
      Op2.push_back(V);
    }
    Type *ScalarTy = getValueType(VL.front());
    VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
    unsigned Opcode0 = LocalState.getOpcode();
    unsigned Opcode1 = LocalState.getAltOpcode();
    SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
    // Enable split node, only if all nodes do not form legal alternate
    // instruction (like X86 addsub), each part has more than one unique
    // scalar, and both parts are power-of-2/full registers.
    SmallPtrSet<Value *, 4> UOp1(Op1.begin(), Op1.end());
    SmallPtrSet<Value *, 4> UOp2(Op2.begin(), Op2.end());
    if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
        TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
        !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), Op1.size()) ||
        !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), Op2.size()))
      return false;
    // Build the order that moves all Op1 lanes before all Op2 lanes while
    // preserving their relative positions.
    unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
    for (unsigned Idx : seq<unsigned>(VL.size())) {
      if (Op1Indices.test(Idx)) {
        ReorderIndices[Op1Cnt] = Idx;
        ++Op1Cnt;
      } else {
        ReorderIndices[Op2Cnt] = Idx;
        ++Op2Cnt;
      }
    }
    if (isIdentityOrder(ReorderIndices))
      ReorderIndices.clear();
    SmallVector<int> Mask;
    if (!ReorderIndices.empty())
      inversePermutation(ReorderIndices, Mask);
    unsigned NumParts = TTI.getNumberOfParts(VecTy);
    VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
    VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
    // Check non-profitable single register ops, which better to be represented
    // as alternate ops.
    if (NumParts >= VL.size())
      return false;
    if ((LocalState.getMainOp()->isBinaryOp() &&
         LocalState.getAltOp()->isBinaryOp() &&
         (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
          LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
        (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
        (LocalState.getMainOp()->isUnaryOp() &&
         LocalState.getAltOp()->isUnaryOp())) {
      // Compare the cost of the split form (two narrow vector ops plus an
      // insert/permute to recombine) against the original alternate-shuffle
      // form (two wide vector ops plus a two-source permute).
      constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
      InstructionCost InsertCost = ::getShuffleCost(
          TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
      FixedVectorType *SubVecTy =
          getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
      InstructionCost NewShuffleCost =
          ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
      if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost))
        return false;
      InstructionCost OriginalVecOpsCost =
          TTI.getArithmeticInstrCost(Opcode0, VecTy, Kind) +
          TTI.getArithmeticInstrCost(Opcode1, VecTy, Kind);
      SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
      for (unsigned Idx : seq<unsigned>(VL.size())) {
        if (isa<PoisonValue>(VL[Idx]))
          continue;
        OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
      }
      InstructionCost OriginalCost =
          OriginalVecOpsCost + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                                VecTy, OriginalMask, Kind);
      InstructionCost NewVecOpsCost =
          TTI.getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
          TTI.getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
      // NOTE(review): the extra NewShuffleCost is charged only for store
      // roots — presumably because the recombining shuffle cannot be folded
      // into the store; confirm against the cost model.
      InstructionCost NewCost =
          NewVecOpsCost + InsertCost +
          (VectorizableTree.front()->hasState() &&
                   VectorizableTree.front()->getOpcode() == Instruction::Store
               ? NewShuffleCost
               : 0);
      // If not profitable to split - exit.
      if (NewCost >= OriginalCost)
        return false;
    }

    SmallVector<Value *> NewVL(VL.size());
    copy(Op1, NewVL.begin());
    copy(Op2, std::next(NewVL.begin(), Op1.size()));
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, std::nullopt,
                            LocalState, UserTreeIdx, {}, ReorderIndices);
    LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
    // Build one operand sub-node of the split entry. The second element of
    // CombinedEntriesWithIndices is the lane offset of the operand within
    // the combined vector.
    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
      InstructionsState S = getSameOpcode(Op, *TLI);
      if (S && (isa<LoadInst>(S.getMainOp()) ||
                getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
        // Build gather node for loads, they will be gathered later.
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        (void)newTreeEntry(Op, TreeEntry::NeedToGather, std::nullopt, S,
                           {TE, Idx});
      } else {
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        buildTree_rec(Op, Depth, {TE, Idx});
      }
    };
    AddNode(Op1, 0);
    AddNode(Op2, 1);
    return true;
  };
+
// If all of the operands are identical or constant we have a simple solution. // If all of the operands are identical or constant we have a simple solution.
// If we deal with insert/extract instructions, they all must have constant // If we deal with insert/extract instructions, they all must have constant
// indices, otherwise we should gather them, not try to vectorize. // indices, otherwise we should gather them, not try to vectorize.
@@ -8614,6 +9007,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
S.getMainOp()) && S.getMainOp()) &&
!all_of(VL, isVectorLikeInstWithConstOps)) || !all_of(VL, isVectorLikeInstWithConstOps)) ||
NotProfitableForVectorization(VL)) { NotProfitableForVectorization(VL)) {
+ if (!S) {
+ auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
+ // Last chance to try to vectorize alternate node.
+ if (MainOp && AltOp &&
+ TrySplitNode(SmallNodeSize, InstructionsState(MainOp, AltOp)))
+ return;
+ }
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
if (TryToFindDuplicates(S)) if (TryToFindDuplicates(S))
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
@@ -8693,6 +9093,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return; return;
} }
+ // FIXME: investigate if there are profitable cases for VL.size() <= 4.
+ if (S.isAltShuffle() && TrySplitNode(SmallNodeSize, S))
+ return;
+
// Check that every instruction appears once in this bundle. // Check that every instruction appears once in this bundle.
if (!TryToFindDuplicates(S, /*DoNotFail=*/true)) if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
return; return;
@@ -8725,6 +9129,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
assert((!BS.getScheduleData(VL0) || assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) && !BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure"); "tryScheduleBundle should cancelScheduling on failure");
+ // Last chance to try to vectorize alternate node.
+ if (S.isAltShuffle() && ReuseShuffleIndices.empty() &&
+ TrySplitNode(SmallNodeSize, S))
+ return;
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices); ReuseShuffleIndices);
NonScheduledFirst.insert(VL.front()); NonScheduledFirst.insert(VL.front());
@@ -8869,6 +9277,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE->dump()); TE->dump());
break; break;
case TreeEntry::CombinedVectorize: case TreeEntry::CombinedVectorize:
+ case TreeEntry::SplitVectorize:
case TreeEntry::NeedToGather: case TreeEntry::NeedToGather:
llvm_unreachable("Unexpected loads state."); llvm_unreachable("Unexpected loads state.");
} }
@@ -10046,6 +10455,69 @@ void BoUpSLP::transformNodes() {
reorderGatherNode(E); reorderGatherNode(E);
} }
+ // Better to use full gathered loads analysis, if there are only 2 loads
+ // gathered nodes each having less than 16 elements.
+ constexpr unsigned VFLimit = 16;
+ bool ForceLoadGather =
+ count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isGather() && TE->hasState() &&
+ TE->getOpcode() == Instruction::Load &&
+ TE->getVectorFactor() < VFLimit;
+ }) == 2;
+
+ // Checks if the scalars are used in other node.
+ auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
+ function_ref<bool(Value *)> CheckContainer) {
+ return TE->isSame(VL) || all_of(VL, [&](Value *V) {
+ if (isa<PoisonValue>(V))
+ return true;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+ return is_contained(TE->Scalars, I) || CheckContainer(I);
+ });
+ };
+ auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
+ if (E.hasState()) {
+ if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
+ !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
+ return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
+ ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
+ return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
+ return is_contained(TEs, TE);
+ });
+ });
+ }))
+ return true;
+ ;
+ if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
+ !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
+ return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
+ ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
+ return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
+ return is_contained(TEs, TE);
+ });
+ });
+ }))
+ return true;
+ } else {
+ // Check if the gather node full copy of split node.
+ auto *It = find_if(E.Scalars, IsaPred<Instruction>);
+ if (It != E.Scalars.end()) {
+ if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
+ !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
+ return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
+ ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
+ return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
+ return is_contained(TEs, TE);
+ });
+ });
+ }))
+ return true;
+ }
+ }
+ return false;
+ };
// The tree may grow here, so iterate over nodes, built before. // The tree may grow here, so iterate over nodes, built before.
for (unsigned Idx : seq<unsigned>(BaseGraphSize)) { for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
TreeEntry &E = *VectorizableTree[Idx]; TreeEntry &E = *VectorizableTree[Idx];
@@ -10060,6 +10532,11 @@ void BoUpSLP::transformNodes() {
E.isAltShuffle() || !allSameBlock(VL)) || E.isAltShuffle() || !allSameBlock(VL)) ||
allConstant(VL) || isSplat(VL)) allConstant(VL) || isSplat(VL))
continue; continue;
+ if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
+ continue;
+ // Check if the node is a copy of other vector nodes.
+ if (CheckForSameVectorNodes(E))
+ continue;
// Try to find vectorizable sequences and transform them into a series of // Try to find vectorizable sequences and transform them into a series of
// insertvector instructions. // insertvector instructions.
unsigned StartIdx = 0; unsigned StartIdx = 0;
@@ -11293,7 +11770,8 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
} }
const auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1), const auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
[&](const std::unique_ptr<TreeEntry> &TE) { [&](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isGather() &&+ return (TE->isGather() ||
+ TE->State == TreeEntry::SplitVectorize) &&
TE->UserTreeIndex.EdgeIdx == Idx && TE->UserTreeIndex.EdgeIdx == Idx &&
TE->UserTreeIndex.UserTE == E; TE->UserTreeIndex.UserTE == E;
}); });
@@ -11351,6 +11829,32 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return processBuildVector<ShuffleCostEstimator, InstructionCost>( return processBuildVector<ShuffleCostEstimator, InstructionCost>(
E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts); E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
} }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    // The cost of a split node is only the cost of recombining its two
    // sub-vectors; the sub-entries themselves are costed separately.
    InstructionCost VectorCost = 0;
    if (E->ReorderIndices.empty()) {
      // No reordering: the second sub-vector is appended after the first,
      // modeled as a subvector insertion at its lane offset.
      VectorCost = ::getShuffleCost(
          *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
          E->CombinedEntriesWithIndices.back().second,
          getWidenedType(
              ScalarTy,
              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                  ->getVectorFactor()));
    } else {
      // Reordering required: model a two-source permute over the widest of
      // the two sub-vector factors, using the split mask.
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
      VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                    getWidenedType(ScalarTy, CommonVF),
                                    E->getSplitMask(), CostKind);
    }
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
    return VectorCost;
  }
InstructionCost CommonCost = 0; InstructionCost CommonCost = 0;
SmallVector<int> Mask; SmallVector<int> Mask;
if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize || if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
@@ -11432,7 +11936,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
EI.EdgeIdx != 0) { EI.EdgeIdx != 0) {
auto UserBWIt = MinBWs.find(EI.UserTE); auto UserBWIt = MinBWs.find(EI.UserTE);
Type *UserScalarTy = Type *UserScalarTy =
- EI.UserTE->isGather()+ (EI.UserTE->isGather() ||
+ EI.UserTE->State == TreeEntry::SplitVectorize)
? EI.UserTE->Scalars.front()->getType() ? EI.UserTE->Scalars.front()->getType()
: EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
if (UserBWIt != MinBWs.end()) if (UserBWIt != MinBWs.end())
@@ -11935,6 +12440,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
break; break;
} }
case TreeEntry::CombinedVectorize: case TreeEntry::CombinedVectorize:
+ case TreeEntry::SplitVectorize:
case TreeEntry::NeedToGather: case TreeEntry::NeedToGather:
llvm_unreachable("Unexpected vectorization state."); llvm_unreachable("Unexpected vectorization state.");
} }
@@ -12431,6 +12937,8 @@ bool BoUpSLP::isTreeNotExtendable() const {
bool Res = false; bool Res = false;
for (unsigned Idx : seq<unsigned>(getTreeSize())) { for (unsigned Idx : seq<unsigned>(getTreeSize())) {
TreeEntry &E = *VectorizableTree[Idx]; TreeEntry &E = *VectorizableTree[Idx];
+ if (E.State == TreeEntry::SplitVectorize)
+ return false;
if (!E.isGather()) if (!E.isGather())
continue; continue;
if ((E.hasState() && E.getOpcode() != Instruction::Load) || if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
@@ -12856,7 +13364,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
continue; continue;
} }
- if (TE.isGather() && TE.hasState()) {+ if (TE.hasState() &&
+ (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
if (const TreeEntry *E = if (const TreeEntry *E =
getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars); getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
E && E->getVectorFactor() == TE.getVectorFactor()) { E && E->getVectorFactor() == TE.getVectorFactor()) {
@@ -13502,7 +14011,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
const BasicBlock *TEInsertBlock = nullptr; const BasicBlock *TEInsertBlock = nullptr;
// Main node of PHI entries keeps the correct order of operands/incoming // Main node of PHI entries keeps the correct order of operands/incoming
// blocks. // blocks.
- if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {+ if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp());
+ PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx); TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
TEInsertPt = TEInsertBlock->getTerminator(); TEInsertPt = TEInsertBlock->getTerminator();
} else { } else {
@@ -13582,7 +14092,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
"Expected only single user of a gather node."); "Expected only single user of a gather node.");
const EdgeInfo &UseEI = TEPtr->UserTreeIndex; const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
- PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());+ PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize
+ ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
+ : nullptr;
const Instruction *InsertPt = const Instruction *InsertPt =
UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator() UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
: &getLastInstructionInBundle(UseEI.UserTE); : &getLastInstructionInBundle(UseEI.UserTE);
@@ -13620,6 +14132,23 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
break; break;
VToTEs.insert(TEPtr); VToTEs.insert(TEPtr);
} }
+ if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
+ const auto *It = find_if(
+ VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
+ if (It != VTEs.end()) {
+ const TreeEntry *VTE = *It;
+ if (none_of(TE->CombinedEntriesWithIndices,
+ [&](const auto &P) { return P.first == VTE->Idx; })) {
+ Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
+ if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
+ continue;
+ }
+ // The node is reused - exit.
+ if (CheckAndUseSameNode(VTE))
+ break;
+ VToTEs.insert(VTE);
+ }
+ }
if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) { if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
const TreeEntry *VTE = VTEs.front(); const TreeEntry *VTE = VTEs.front();
if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) && if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
@@ -14173,6 +14702,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
assert(((GatheredLoadsEntriesFirst.has_value() && assert(((GatheredLoadsEntriesFirst.has_value() &&
E->getOpcode() == Instruction::Load && E->isGather() && E->getOpcode() == Instruction::Load && E->isGather() &&
E->Idx < *GatheredLoadsEntriesFirst) || E->Idx < *GatheredLoadsEntriesFirst) ||
+ E->State == TreeEntry::SplitVectorize ||
all_of(E->Scalars, all_of(E->Scalars,
[=](Value *V) -> bool { [=](Value *V) -> bool {
if (E->getOpcode() == Instruction::GetElementPtr && if (E->getOpcode() == Instruction::GetElementPtr &&
@@ -14198,6 +14728,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
} }
assert(((E->getOpcode() == Instruction::GetElementPtr && assert(((E->getOpcode() == Instruction::GetElementPtr &&
!isa<GetElementPtrInst>(I)) || !isa<GetElementPtrInst>(I)) ||
+ E->State == TreeEntry::SplitVectorize ||
(isVectorLikeInstWithConstOps(LastInst) && (isVectorLikeInstWithConstOps(LastInst) &&
isVectorLikeInstWithConstOps(I)) || isVectorLikeInstWithConstOps(I)) ||
(GatheredLoadsEntriesFirst.has_value() && (GatheredLoadsEntriesFirst.has_value() &&
@@ -14259,8 +14790,14 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
return FirstInst; return FirstInst;
}; };
+ if (E->State == TreeEntry::SplitVectorize) {
+ Res = FindLastInst();
+ return *Res;
+ }
+
// Set insertpoint for gathered loads to the very first load. // Set insertpoint for gathered loads to the very first load.
- if (GatheredLoadsEntriesFirst.has_value() &&+ if (E->State != TreeEntry::SplitVectorize &&
+ GatheredLoadsEntriesFirst.has_value() &&
E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
E->getOpcode() == Instruction::Load) { E->getOpcode() == Instruction::Load) {
Res = FindFirstInst(); Res = FindFirstInst();
@@ -14339,7 +14876,10 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
bool IsPHI = isa<PHINode>(LastInst); bool IsPHI = isa<PHINode>(LastInst);
if (IsPHI) if (IsPHI)
LastInstIt = LastInst->getParent()->getFirstNonPHIIt(); LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
- if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {+ if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars)) ||
+ (GatheredLoadsEntriesFirst.has_value() &&
+ E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
+ E->getOpcode() == Instruction::Load)) {
Builder.SetInsertPoint(LastInst->getParent(), LastInstIt); Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
} else { } else {
// Set the insertion point after the last instruction in the bundle. Set the // Set the insertion point after the last instruction in the bundle. Set the
@@ -15145,7 +15685,9 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
// correctness of the transformations in many cases. // correctness of the transformations in many cases.
auto *I = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1), auto *I = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
[E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) { [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isOperandGatherNode({E, NodeIdx});+ return TE->isOperandGatherNode({E, NodeIdx}) ||
+ (TE->State == TreeEntry::SplitVectorize &&
+ TE->UserTreeIndex == EdgeInfo(E, NodeIdx));
}); });
assert(I != VectorizableTree.end() && "Gather node is not in the graph."); assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
assert(I->get()->UserTreeIndex && assert(I->get()->UserTreeIndex &&
@@ -15683,6 +16225,83 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = Vec; E->VectorizedValue = Vec;
return Vec; return Vec;
} }
+ if (E->State == TreeEntry::SplitVectorize) {
+ assert(E->CombinedEntriesWithIndices.size() == 2 &&
+ "Expected exactly 2 combined entries.");
+ setInsertPointAfterBundle(E);
+ TreeEntry &OpTE1 =
+ *VectorizableTree[E->CombinedEntriesWithIndices.front().first].get();
+ assert(OpTE1.isSame(
+ ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
+ "Expected same first part of scalars.");
+ Value *Op1 = vectorizeTree(&OpTE1);
+ TreeEntry &OpTE2 =
+ *VectorizableTree[E->CombinedEntriesWithIndices.back().first].get();
+ assert(
+ OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
+ "Expected same second part of scalars.");
+ Value *Op2 = vectorizeTree(&OpTE2);
+ auto GetOperandSignedness = [&](const TreeEntry *OpE) {
+ bool IsSigned = false;
+ auto It = MinBWs.find(OpE);
+ if (It != MinBWs.end())
+ IsSigned = It->second.second;
+ else
+ IsSigned = any_of(OpE->Scalars, [&](Value *R) {
+ if (isa<PoisonValue>(V))
+ return false;
+ return !isKnownNonNegative(R, SimplifyQuery(*DL));
+ });
+ return IsSigned;
+ };
+ if (cast<VectorType>(Op1->getType())->getElementType() != ScalarTy) {
+ assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
+ Op1 = Builder.CreateIntCast(
+ Op1,
+ getWidenedType(
+ ScalarTy,
+ cast<FixedVectorType>(Op1->getType())->getNumElements()),
+ GetOperandSignedness(&OpTE1));
+ }
+ if (cast<VectorType>(Op2->getType())->getElementType() != ScalarTy) {
+ assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
+ Op2 = Builder.CreateIntCast(
+ Op2,
+ getWidenedType(
+ ScalarTy,
+ cast<FixedVectorType>(Op2->getType())->getNumElements()),
+ GetOperandSignedness(&OpTE2));
+ }
+ if (E->ReorderIndices.empty()) {
+ SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
+ std::iota(
+ Mask.begin(),
+ std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
+ 0);
+ Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
+ Vec = createInsertVector(Builder, Vec, Op2,
+ E->CombinedEntriesWithIndices.back().second);
+ E->VectorizedValue = Vec;
+ return Vec;
+ }
+ unsigned CommonVF =
+ std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
+ if (getNumElements(Op1->getType()) != CommonVF) {
+ SmallVector<int> Mask(CommonVF, PoisonMaskElem);
+ std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
+ 0);
+ Op1 = Builder.CreateShuffleVector(Op1, Mask);
+ }
+ if (getNumElements(Op2->getType()) != CommonVF) {
+ SmallVector<int> Mask(CommonVF, PoisonMaskElem);
+ std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
+ 0);
+ Op2 = Builder.CreateShuffleVector(Op2, Mask);
+ }
+ Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
+ E->VectorizedValue = Vec;
+ return Vec;
+ }
bool IsReverseOrder = bool IsReverseOrder =
!E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices); !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
@@ -17138,7 +17757,7 @@ Value *BoUpSLP::vectorizeTree(
TreeEntry *Entry = TEPtr.get(); TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values. // No need to handle users of gathered values.
- if (Entry->isGather())+ if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
continue; continue;
assert(Entry->VectorizedValue && "Can't find vectorizable value"); assert(Entry->VectorizedValue && "Can't find vectorizable value");
@@ -17191,6 +17810,9 @@ Value *BoUpSLP::vectorizeTree(
VectorizableTree.front().get()) || VectorizableTree.front().get()) ||
(IE->UserTreeIndex.UserTE == VectorizableTree.front().get() && (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
IE->UserTreeIndex.EdgeIdx == UINT_MAX))) && IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
+ !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
+ IE->UserTreeIndex &&
+ is_contained(VectorizableTree.front()->Scalars, I)) &&
!(GatheredLoadsEntriesFirst.has_value() && !(GatheredLoadsEntriesFirst.has_value() &&
IE->Idx >= *GatheredLoadsEntriesFirst && IE->Idx >= *GatheredLoadsEntriesFirst &&
VectorizableTree.front()->isGather() && VectorizableTree.front()->isGather() &&
@@ -18228,6 +18850,13 @@ bool BoUpSLP::collectValuesToDemote(
ToDemote.push_back(E.Idx); ToDemote.push_back(E.Idx);
return IsProfitableToDemote; return IsProfitableToDemote;
}; };
+
+ if (E.State == TreeEntry::SplitVectorize)
+ return TryProcessInstruction(
+ BitWidth,
+ {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
+ VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
+
switch (E.getOpcode()) { switch (E.getOpcode()) {
// We can always demote truncations and extensions. Since truncations can // We can always demote truncations and extensions. Since truncations can
llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -17,15 +17,12 @@
define void @s116_modified(ptr %a) { define void @s116_modified(ptr %a) {
; CHECK-LABEL: @s116_modified( ; CHECK-LABEL: @s116_modified(
-; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2+; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 4
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3
; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[GEP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 3, i32 poison>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[LD0]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2)
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 5, i32 6>
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]]
; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4 ; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4
; CHECK-NEXT: ret void ; CHECK-NEXT: ret void
llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-20 | FileCheck %s+; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v | FileCheck %s
; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-15 | FileCheck %s --check-prefix=THR15 ; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-15 | FileCheck %s --check-prefix=THR15
define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) { define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) {
@@ -17,78 +17,122 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr null, align 1+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr null, align 1+; CHECK-NEXT: [[TMP115:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
-; CHECK-NEXT: [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; CHECK-NEXT: [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
-; CHECK-NEXT: [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1+; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; CHECK-NEXT: [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT: [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1+; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
-; CHECK-NEXT: [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1+; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1+; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1+; CHECK-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
-; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1+; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr null, align 1+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP10]], i64 0)+; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP14]], i64 4)+; CHECK-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP16]], <4 x i8> [[TMP2]], i64 8)+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP17]], <4 x i8> [[TMP6]], i64 12)+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>+; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr null, align 1+; CHECK-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP11]], i64 0)+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP20]], i64 4)+; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1
-; CHECK-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP92]], i64 8)+; CHECK-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP23]], <4 x i8> [[TMP132]], i64 12)+; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>+; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; CHECK-NEXT: [[TMP26:%.*]] = sub <16 x i32> [[TMP19]], [[TMP25]]+; CHECK-NEXT: [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> poison, <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 2, i32 6, i32 14, i32 10, i32 1, i32 5, i32 13, i32 9, i32 0, i32 4, i32 12, i32 8>+; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)+; CHECK-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <2 x i8> [[TMP28]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>+; CHECK-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP29]], <16 x i32> <i32 3, i32 4, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 0, i32 5, i32 poison, i32 poison>+; CHECK-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32>
-; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>+; CHECK-NEXT: [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]]
-; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 19, i32 poison, i32 4, i32 poison, i32 18, i32 poison, i32 8, i32 poison, i32 17, i32 poison, i32 12, i32 13, i32 16, i32 poison>+; CHECK-NEXT: [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16)
-; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>+; CHECK-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]]
-; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 poison, i32 6, i32 18, i32 8, i32 poison, i32 10, i32 17, i32 12, i32 13, i32 14, i32 16>+; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP3]], i32 5+; CHECK-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP34]], [[TMP33]]
-; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP52]], i32 9+; CHECK-NEXT: [[TMP36:%.*]] = sub <4 x i32> [[TMP34]], [[TMP33]]
-; CHECK-NEXT: [[TMP37:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i32>+; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; CHECK-NEXT: [[TMP38:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1+; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP13]], i64 0)+; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]]
-; CHECK-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP38]], i64 4)+; CHECK-NEXT: [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]]
-; CHECK-NEXT: [[TMP41:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP98]], i64 8)+; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP42:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP41]], <4 x i8> [[TMP138]], i64 12)+; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32>+; CHECK-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32>
-; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 2, i32 6, i32 14, i32 10, i32 1, i32 5, i32 13, i32 9, i32 0, i32 4, i32 12, i32 8>+; CHECK-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
-; CHECK-NEXT: [[TMP45:%.*]] = sub <16 x i32> [[TMP37]], [[TMP44]]+; CHECK-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32>
-; CHECK-NEXT: [[TMP46:%.*]] = shl <16 x i32> [[TMP45]], splat (i32 16)+; CHECK-NEXT: [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = add <16 x i32> [[TMP46]], [[TMP27]]+; CHECK-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>+; CHECK-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32>
-; CHECK-NEXT: [[TMP49:%.*]] = add <16 x i32> [[TMP47]], [[TMP48]]+; CHECK-NEXT: [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; CHECK-NEXT: [[TMP50:%.*]] = sub <16 x i32> [[TMP47]], [[TMP48]]+; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32>
-; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>+; CHECK-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]]
-; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>+; CHECK-NEXT: [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16)
-; CHECK-NEXT: [[TMP53:%.*]] = add <16 x i32> [[TMP51]], [[TMP70]]+; CHECK-NEXT: [[TMP53:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]]
-; CHECK-NEXT: [[TMP54:%.*]] = sub <16 x i32> [[TMP51]], [[TMP70]]+; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>+; CHECK-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP53]]
-; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>+; CHECK-NEXT: [[TMP56:%.*]] = sub <4 x i32> [[TMP54]], [[TMP53]]
-; CHECK-NEXT: [[TMP57:%.*]] = sub <16 x i32> [[TMP55]], [[TMP56]]+; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; CHECK-NEXT: [[TMP58:%.*]] = add <16 x i32> [[TMP55]], [[TMP56]]+; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 4, i32 21, i32 22, i32 7, i32 8, i32 25, i32 26, i32 11, i32 12, i32 29, i32 30, i32 15>+; CHECK-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]]
-; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>+; CHECK-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]]
-; CHECK-NEXT: [[TMP61:%.*]] = add <16 x i32> [[TMP59]], [[TMP60]]+; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP62:%.*]] = sub <16 x i32> [[TMP59]], [[TMP60]]+; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>+; CHECK-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1
-; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP19]], <16 x i32> <i32 0, i32 20, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 28, i32 29, i32 30, i32 11, i32 24, i32 25, i32 26, i32 27>+; CHECK-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32>
-; CHECK-NEXT: [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15)+; CHECK-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1
-; CHECK-NEXT: [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537)+; CHECK-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32>
-; CHECK-NEXT: [[TMP67:%.*]] = mul <16 x i32> [[TMP66]], splat (i32 65535)+; CHECK-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]]
-; CHECK-NEXT: [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP63]]+; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP64]]+; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP115]], i32 0
-; CHECK-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])+; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1
-; CHECK-NEXT: ret i32 [[ADD113_3]]+; CHECK-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2)
+; CHECK-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32>
+; CHECK-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32>
+; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]]
+; CHECK-NEXT: [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16)
+; CHECK-NEXT: [[TMP78:%.*]] = add <4 x i32> [[TMP77]], [[TMP68]]
+; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP78]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[TMP80:%.*]] = add <4 x i32> [[TMP78]], [[TMP79]]
+; CHECK-NEXT: [[TMP81:%.*]] = sub <4 x i32> [[TMP78]], [[TMP79]]
+; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <4 x i32> [[TMP82]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[TMP85:%.*]] = sub <4 x i32> [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]]
+; CHECK-NEXT: [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]]
+; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP87]], i64 4)
+; CHECK-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]]
+; CHECK-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]]
+; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP91]], i64 4)
+; CHECK-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP94]], [[TMP90]]
+; CHECK-NEXT: [[TMP96:%.*]] = sub <8 x i32> [[TMP90]], [[TMP94]]
+; CHECK-NEXT: [[TMP97:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP96]], <16 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 12, i32 8, i32 13, i32 9, i32 14, i32 10, i32 15, i32 11>
+; CHECK-NEXT: [[TMP98:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <16 x i32> [[TMP98]], <16 x i32> [[TMP99]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <16 x i32> [[TMP100]], <16 x i32> [[TMP101]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 18, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <16 x i32> [[TMP102]], <16 x i32> [[TMP103]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 poison, i32 18, i32 19>
+; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <16 x i32> [[TMP104]], <16 x i32> [[TMP105]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 poison, i32 17, i32 poison, i32 12, i32 poison, i32 14, i32 15>
+; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <16 x i32> [[TMP106]], <16 x i32> [[TMP107]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 10, i32 17, i32 12, i32 18, i32 14, i32 15>
+; CHECK-NEXT: [[TMP109:%.*]] = lshr <16 x i32> [[TMP108]], splat (i32 15)
+; CHECK-NEXT: [[TMP110:%.*]] = and <16 x i32> [[TMP109]], splat (i32 65537)
+; CHECK-NEXT: [[TMP111:%.*]] = mul <16 x i32> [[TMP110]], splat (i32 65535)
+; CHECK-NEXT: [[TMP112:%.*]] = add <16 x i32> [[TMP111]], [[TMP97]]
+; CHECK-NEXT: [[TMP113:%.*]] = xor <16 x i32> [[TMP112]], [[TMP108]]
+; CHECK-NEXT: [[TMP114:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP113]])
+; CHECK-NEXT: ret i32 [[TMP114]]
; ;
; THR15-LABEL: define i32 @test( ; THR15-LABEL: define i32 @test(
; THR15-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] { ; THR15-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -104,78 +148,122 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 ; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; THR15-NEXT: [[TMP48:%.*]] = load i8, ptr null, align 1+; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1
; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr null, align 1 ; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr null, align 1
; THR15-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 ; THR15-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
-; THR15-NEXT: [[TMP143:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1+; THR15-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; THR15-NEXT: [[TMP146:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1+; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
-; THR15-NEXT: [[TMP147:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1+; THR15-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; THR15-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1+; THR15-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; THR15-NEXT: [[TMP148:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1+; THR15-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
-; THR15-NEXT: [[TMP152:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1+; THR15-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
-; THR15-NEXT: [[TMP153:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1+; THR15-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; THR15-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1+; THR15-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32>
-; THR15-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1+; THR15-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]]
-; THR15-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1+; THR15-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
-; THR15-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1+; THR15-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; THR15-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr null, align 1+; THR15-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP10]], i64 0)+; THR15-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP13]]
-; THR15-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP14]], i64 4)+; THR15-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP14]], [[TMP13]]
-; THR15-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP16]], <4 x i8> [[TMP2]], i64 8)+; THR15-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; THR15-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP17]], <4 x i8> [[TMP6]], i64 12)+; THR15-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>+; THR15-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]]
-; THR15-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr null, align 1+; THR15-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]]
-; THR15-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP11]], i64 0)+; THR15-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; THR15-NEXT: [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP20]], i64 4)+; THR15-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1
-; THR15-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP143]], i64 8)+; THR15-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32>
-; THR15-NEXT: [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP23]], <4 x i8> [[TMP148]], i64 12)+; THR15-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
-; THR15-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>+; THR15-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; THR15-NEXT: [[TMP26:%.*]] = sub <16 x i32> [[TMP19]], [[TMP25]]+; THR15-NEXT: [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]]
-; THR15-NEXT: [[TMP27:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> poison, <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 2, i32 6, i32 14, i32 10, i32 1, i32 5, i32 13, i32 9, i32 0, i32 4, i32 12, i32 8>+; THR15-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; THR15-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)+; THR15-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32>
-; THR15-NEXT: [[TMP29:%.*]] = shufflevector <2 x i8> [[TMP28]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>+; THR15-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP29]], <16 x i32> <i32 3, i32 4, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 0, i32 5, i32 poison, i32 poison>+; THR15-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32>
-; THR15-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>+; THR15-NEXT: [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]]
-; THR15-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 19, i32 poison, i32 4, i32 poison, i32 18, i32 poison, i32 8, i32 poison, i32 17, i32 poison, i32 12, i32 13, i32 16, i32 poison>+; THR15-NEXT: [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16)
-; THR15-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>+; THR15-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]]
-; THR15-NEXT: [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 poison, i32 6, i32 18, i32 8, i32 poison, i32 10, i32 17, i32 12, i32 13, i32 14, i32 16>+; THR15-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP1]], i32 5+; THR15-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP34]], [[TMP33]]
-; THR15-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP48]], i32 9+; THR15-NEXT: [[TMP36:%.*]] = sub <4 x i32> [[TMP34]], [[TMP33]]
-; THR15-NEXT: [[TMP37:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i32>+; THR15-NEXT: [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; THR15-NEXT: [[TMP38:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1+; THR15-NEXT: [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP13]], i64 0)+; THR15-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]]
-; THR15-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP38]], i64 4)+; THR15-NEXT: [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]]
-; THR15-NEXT: [[TMP41:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP147]], i64 8)+; THR15-NEXT: [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; THR15-NEXT: [[TMP42:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP41]], <4 x i8> [[TMP153]], i64 12)+; THR15-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; THR15-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32>+; THR15-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32>
-; THR15-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 2, i32 6, i32 14, i32 10, i32 1, i32 5, i32 13, i32 9, i32 0, i32 4, i32 12, i32 8>+; THR15-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
-; THR15-NEXT: [[TMP45:%.*]] = sub <16 x i32> [[TMP37]], [[TMP44]]+; THR15-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32>
-; THR15-NEXT: [[TMP46:%.*]] = shl <16 x i32> [[TMP45]], splat (i32 16)+; THR15-NEXT: [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]]
-; THR15-NEXT: [[TMP47:%.*]] = add <16 x i32> [[TMP46]], [[TMP27]]+; THR15-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; THR15-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>+; THR15-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32>
-; THR15-NEXT: [[TMP49:%.*]] = add <16 x i32> [[TMP47]], [[TMP70]]+; THR15-NEXT: [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; THR15-NEXT: [[TMP50:%.*]] = sub <16 x i32> [[TMP47]], [[TMP70]]+; THR15-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32>
-; THR15-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>+; THR15-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]]
-; THR15-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>+; THR15-NEXT: [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16)
-; THR15-NEXT: [[TMP53:%.*]] = add <16 x i32> [[TMP51]], [[TMP52]]+; THR15-NEXT: [[TMP53:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]]
-; THR15-NEXT: [[TMP54:%.*]] = sub <16 x i32> [[TMP51]], [[TMP52]]+; THR15-NEXT: [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>+; THR15-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP53]]
-; THR15-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>+; THR15-NEXT: [[TMP56:%.*]] = sub <4 x i32> [[TMP54]], [[TMP53]]
-; THR15-NEXT: [[TMP57:%.*]] = sub <16 x i32> [[TMP55]], [[TMP56]]+; THR15-NEXT: [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; THR15-NEXT: [[TMP58:%.*]] = add <16 x i32> [[TMP55]], [[TMP56]]+; THR15-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 4, i32 21, i32 22, i32 7, i32 8, i32 25, i32 26, i32 11, i32 12, i32 29, i32 30, i32 15>+; THR15-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]]
-; THR15-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>+; THR15-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]]
-; THR15-NEXT: [[TMP61:%.*]] = add <16 x i32> [[TMP59]], [[TMP60]]+; THR15-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; THR15-NEXT: [[TMP62:%.*]] = sub <16 x i32> [[TMP59]], [[TMP60]]+; THR15-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
-; THR15-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>+; THR15-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP19]], <16 x i32> <i32 0, i32 20, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 28, i32 29, i32 30, i32 11, i32 24, i32 25, i32 26, i32 27>+; THR15-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32>
-; THR15-NEXT: [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15)+; THR15-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537)+; THR15-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32>
-; THR15-NEXT: [[TMP67:%.*]] = mul <16 x i32> [[TMP66]], splat (i32 65535)+; THR15-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]]
-; THR15-NEXT: [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP63]]+; THR15-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; THR15-NEXT: [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP64]]+; THR15-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0
-; THR15-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])+; THR15-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1
-; THR15-NEXT: ret i32 [[ADD113_3]]+; THR15-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2)
+; THR15-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32>
+; THR15-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; THR15-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32>
+; THR15-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; THR15-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]]
+; THR15-NEXT: [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16)
+; THR15-NEXT: [[TMP78:%.*]] = add <4 x i32> [[TMP77]], [[TMP68]]
+; THR15-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP78]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; THR15-NEXT: [[TMP80:%.*]] = add <4 x i32> [[TMP78]], [[TMP79]]
+; THR15-NEXT: [[TMP81:%.*]] = sub <4 x i32> [[TMP78]], [[TMP79]]
+; THR15-NEXT: [[TMP82:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; THR15-NEXT: [[TMP83:%.*]] = shufflevector <4 x i32> [[TMP82]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; THR15-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
+; THR15-NEXT: [[TMP85:%.*]] = sub <4 x i32> [[TMP82]], [[TMP83]]
+; THR15-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; THR15-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]]
+; THR15-NEXT: [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]]
+; THR15-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP87]], i64 4)
+; THR15-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]]
+; THR15-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]]
+; THR15-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP91]], i64 4)
+; THR15-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP94]], [[TMP90]]
+; THR15-NEXT: [[TMP96:%.*]] = sub <8 x i32> [[TMP90]], [[TMP94]]
+; THR15-NEXT: [[TMP97:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP96]], <16 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 12, i32 8, i32 13, i32 9, i32 14, i32 10, i32 15, i32 11>
+; THR15-NEXT: [[TMP98:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP99:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP100:%.*]] = shufflevector <16 x i32> [[TMP98]], <16 x i32> [[TMP99]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP101:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP102:%.*]] = shufflevector <16 x i32> [[TMP100]], <16 x i32> [[TMP101]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 18, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP104:%.*]] = shufflevector <16 x i32> [[TMP102]], <16 x i32> [[TMP103]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 poison, i32 18, i32 19>
+; THR15-NEXT: [[TMP105:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP106:%.*]] = shufflevector <16 x i32> [[TMP104]], <16 x i32> [[TMP105]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 poison, i32 17, i32 poison, i32 12, i32 poison, i32 14, i32 15>
+; THR15-NEXT: [[TMP107:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP108:%.*]] = shufflevector <16 x i32> [[TMP106]], <16 x i32> [[TMP107]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 10, i32 17, i32 12, i32 18, i32 14, i32 15>
+; THR15-NEXT: [[TMP109:%.*]] = lshr <16 x i32> [[TMP108]], splat (i32 15)
+; THR15-NEXT: [[TMP110:%.*]] = and <16 x i32> [[TMP109]], splat (i32 65537)
+; THR15-NEXT: [[TMP111:%.*]] = mul <16 x i32> [[TMP110]], splat (i32 65535)
+; THR15-NEXT: [[TMP112:%.*]] = add <16 x i32> [[TMP111]], [[TMP97]]
+; THR15-NEXT: [[TMP113:%.*]] = xor <16 x i32> [[TMP112]], [[TMP108]]
+; THR15-NEXT: [[TMP114:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP113]])
+; THR15-NEXT: ret i32 [[TMP114]]
; ;
entry: entry:
%0 = load i8, ptr %pix1, align 1 %0 = load i8, ptr %pix1, align 1
llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -1022,10 +1022,8 @@ define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) {
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0)+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2)+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2)
; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) ; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -1,17 +1,45 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
define <8 x float> @sitofp_uitofp(<8 x i32> %a) { define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
-; CHECK-LABEL: @sitofp_uitofp(+; SSE2-LABEL: @sitofp_uitofp(
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>+; SSE2-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x float> [[TMP3]]+; SSE2-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT: ret <8 x float> [[TMP5]]
+;
+; SLM-LABEL: @sitofp_uitofp(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX-LABEL: @sitofp_uitofp(
+; AVX-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x float> [[TMP3]]
+;
+; AVX2-LABEL: @sitofp_uitofp(
+; AVX2-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX2-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x float> [[TMP3]]
+;
+; AVX512-LABEL: @sitofp_uitofp(
+; AVX512-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x float> [[TMP3]]
; ;
%a0 = extractelement <8 x i32> %a, i32 0 %a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1 %a1 = extractelement <8 x i32> %a, i32 1
@@ -41,11 +69,39 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
} }
define <8 x i32> @fptosi_fptoui(<8 x float> %a) { define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
-; CHECK-LABEL: @fptosi_fptoui(+; SSE2-LABEL: @fptosi_fptoui(
-; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>+; SSE2-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x i32> [[TMP3]]+; SSE2-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT: ret <8 x i32> [[TMP5]]
+;
+; SLM-LABEL: @fptosi_fptoui(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX-LABEL: @fptosi_fptoui(
+; AVX-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX2-LABEL: @fptosi_fptoui(
+; AVX2-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; AVX2-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX512-LABEL: @fptosi_fptoui(
+; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[TMP3]]
; ;
%a0 = extractelement <8 x float> %a, i32 0 %a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1 %a1 = extractelement <8 x float> %a, i32 1
@@ -75,11 +131,39 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
} }
define <8 x float> @fneg_fabs(<8 x float> %a) { define <8 x float> @fneg_fabs(<8 x float> %a) {
-; CHECK-LABEL: @fneg_fabs(+; SSE2-LABEL: @fneg_fabs(
-; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE2-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]]+; SSE2-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
+; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]]
+;
+; SLM-LABEL: @fneg_fabs(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
+; SLM-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
+; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]]
+;
+; AVX-LABEL: @fneg_fabs(
+; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
+; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]]
+;
+; AVX2-LABEL: @fneg_fabs(
+; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
+; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]]
+;
+; AVX512-LABEL: @fneg_fabs(
+; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
+; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]]
; ;
%a0 = extractelement <8 x float> %a, i32 0 %a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1 %a1 = extractelement <8 x float> %a, i32 1
@@ -125,11 +209,39 @@ define <8 x float> @fneg_fabs(<8 x float> %a) {
} }
define <8 x i32> @sext_zext(<8 x i16> %a) { define <8 x i32> @sext_zext(<8 x i16> %a) {
-; CHECK-LABEL: @sext_zext(+; SSE2-LABEL: @sext_zext(
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>+; SSE2-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x i32> [[TMP3]]+; SSE2-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT: ret <8 x i32> [[TMP5]]
+;
+; SLM-LABEL: @sext_zext(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX-LABEL: @sext_zext(
+; AVX-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; AVX-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX2-LABEL: @sext_zext(
+; AVX2-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; AVX2-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX512-LABEL: @sext_zext(
+; AVX512-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[TMP3]]
; ;
%a0 = extractelement <8 x i16> %a, i32 0 %a0 = extractelement <8 x i16> %a, i32 0
%a1 = extractelement <8 x i16> %a, i32 1 %a1 = extractelement <8 x i16> %a, i32 1
llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -1,17 +1,45 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
define <8 x float> @sitofp_uitofp(<8 x i32> %a) { define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
-; CHECK-LABEL: @sitofp_uitofp(+; SSE2-LABEL: @sitofp_uitofp(
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>+; SSE2-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x float> [[TMP3]]+; SSE2-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT: ret <8 x float> [[TMP5]]
+;
+; SLM-LABEL: @sitofp_uitofp(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX-LABEL: @sitofp_uitofp(
+; AVX-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x float> [[TMP3]]
+;
+; AVX2-LABEL: @sitofp_uitofp(
+; AVX2-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX2-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x float> [[TMP3]]
+;
+; AVX512-LABEL: @sitofp_uitofp(
+; AVX512-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x float> [[TMP3]]
; ;
%a0 = extractelement <8 x i32> %a, i32 0 %a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1 %a1 = extractelement <8 x i32> %a, i32 1
@@ -41,11 +69,39 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
} }
define <8 x i32> @fptosi_fptoui(<8 x float> %a) { define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
-; CHECK-LABEL: @fptosi_fptoui(+; SSE2-LABEL: @fptosi_fptoui(
-; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>+; SSE2-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x i32> [[TMP3]]+; SSE2-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT: ret <8 x i32> [[TMP5]]
+;
+; SLM-LABEL: @fptosi_fptoui(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX-LABEL: @fptosi_fptoui(
+; AVX-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX2-LABEL: @fptosi_fptoui(
+; AVX2-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; AVX2-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX512-LABEL: @fptosi_fptoui(
+; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[TMP3]]
; ;
%a0 = extractelement <8 x float> %a, i32 0 %a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1 %a1 = extractelement <8 x float> %a, i32 1
@@ -75,11 +131,39 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
} }
define <8 x float> @fneg_fabs(<8 x float> %a) { define <8 x float> @fneg_fabs(<8 x float> %a) {
-; CHECK-LABEL: @fneg_fabs(+; SSE2-LABEL: @fneg_fabs(
-; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE2-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]]+; SSE2-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
+; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]]
+;
+; SLM-LABEL: @fneg_fabs(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
+; SLM-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
+; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]]
+;
+; AVX-LABEL: @fneg_fabs(
+; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
+; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]]
+;
+; AVX2-LABEL: @fneg_fabs(
+; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
+; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]]
+;
+; AVX512-LABEL: @fneg_fabs(
+; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
+; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]]
; ;
%a0 = extractelement <8 x float> %a, i32 0 %a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1 %a1 = extractelement <8 x float> %a, i32 1
@@ -125,11 +209,39 @@ define <8 x float> @fneg_fabs(<8 x float> %a) {
} }
define <8 x i32> @sext_zext(<8 x i16> %a) { define <8 x i32> @sext_zext(<8 x i16> %a) {
-; CHECK-LABEL: @sext_zext(+; SSE2-LABEL: @sext_zext(
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>+; SSE2-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x i32> [[TMP3]]+; SSE2-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT: ret <8 x i32> [[TMP5]]
+;
+; SLM-LABEL: @sext_zext(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX-LABEL: @sext_zext(
+; AVX-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; AVX-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX2-LABEL: @sext_zext(
+; AVX2-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; AVX2-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX512-LABEL: @sext_zext(
+; AVX512-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[TMP3]]
; ;
%a0 = extractelement <8 x i16> %a, i32 0 %a0 = extractelement <8 x i16> %a, i32 0
%a1 = extractelement <8 x i16> %a, i32 1 %a1 = extractelement <8 x i16> %a, i32 1
llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
@@ -1,17 +1,47 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: @fadd_fsub_v8f32(+; SSE-LABEL: @fadd_fsub_v8f32(
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]+; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>+; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; CHECK-NEXT: ret <8 x float> [[TMP3]]+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT: ret <8 x float> [[TMP5]]
+;
+; SLM-LABEL: @fadd_fsub_v8f32(
+; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX-LABEL: @fadd_fsub_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX2-LABEL: @fadd_fsub_v8f32(
+; AVX2-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX2-NEXT: ret <8 x float> [[TMP3]]
+;
+; AVX512-LABEL: @fadd_fsub_v8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX512-NEXT: ret <8 x float> [[TMP3]]
; ;
%a0 = extractelement <8 x float> %a, i32 0 %a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1 %a1 = extractelement <8 x float> %a, i32 1
@@ -49,11 +79,43 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
} }
define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: @fmul_fdiv_v8f32(+; SSE-LABEL: @fmul_fdiv_v8f32(
-; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]+; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>+; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; CHECK-NEXT: ret <8 x float> [[TMP3]]+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT: ret <8 x float> [[TMP5]]
+;
+; SLM-LABEL: @fmul_fdiv_v8f32(
+; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX-LABEL: @fmul_fdiv_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX2-LABEL: @fmul_fdiv_v8f32(
+; AVX2-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX2-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX2-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX512-LABEL: @fmul_fdiv_v8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX512-NEXT: ret <8 x float> [[TMP3]]
; ;
%a0 = extractelement <8 x float> %a, i32 0 %a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1 %a1 = extractelement <8 x float> %a, i32 1
@@ -110,6 +172,10 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00> ; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; AVX-NEXT: ret <4 x float> [[TMP1]] ; AVX-NEXT: ret <4 x float> [[TMP1]]
; ;
+; AVX2-LABEL: @fmul_fdiv_v4f32_const(
+; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX2-NEXT: ret <4 x float> [[TMP1]]
+;
; AVX512-LABEL: @fmul_fdiv_v4f32_const( ; AVX512-LABEL: @fmul_fdiv_v4f32_const(
; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00> ; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; AVX512-NEXT: ret <4 x float> [[TMP1]] ; AVX512-NEXT: ret <4 x float> [[TMP1]]
llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -1,17 +1,47 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: @fadd_fsub_v8f32(+; SSE-LABEL: @fadd_fsub_v8f32(
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]+; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>+; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; CHECK-NEXT: ret <8 x float> [[TMP3]]+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT: ret <8 x float> [[TMP5]]
+;
+; SLM-LABEL: @fadd_fsub_v8f32(
+; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX-LABEL: @fadd_fsub_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX2-LABEL: @fadd_fsub_v8f32(
+; AVX2-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX2-NEXT: ret <8 x float> [[TMP3]]
+;
+; AVX512-LABEL: @fadd_fsub_v8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX512-NEXT: ret <8 x float> [[TMP3]]
; ;
%a0 = extractelement <8 x float> %a, i32 0 %a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1 %a1 = extractelement <8 x float> %a, i32 1
@@ -49,11 +79,43 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
} }
define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: @fmul_fdiv_v8f32(+; SSE-LABEL: @fmul_fdiv_v8f32(
-; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]+; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>+; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; CHECK-NEXT: ret <8 x float> [[TMP3]]+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT: ret <8 x float> [[TMP5]]
+;
+; SLM-LABEL: @fmul_fdiv_v8f32(
+; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX-LABEL: @fmul_fdiv_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX2-LABEL: @fmul_fdiv_v8f32(
+; AVX2-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX2-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX2-NEXT: ret <8 x float> [[TMP5]]
+;
+; AVX512-LABEL: @fmul_fdiv_v8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX512-NEXT: ret <8 x float> [[TMP3]]
; ;
%a0 = extractelement <8 x float> %a, i32 0 %a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1 %a1 = extractelement <8 x float> %a, i32 1
@@ -110,6 +172,10 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00> ; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; AVX-NEXT: ret <4 x float> [[TMP1]] ; AVX-NEXT: ret <4 x float> [[TMP1]]
; ;
+; AVX2-LABEL: @fmul_fdiv_v4f32_const(
+; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX2-NEXT: ret <4 x float> [[TMP1]]
+;
; AVX512-LABEL: @fmul_fdiv_v4f32_const( ; AVX512-LABEL: @fmul_fdiv_v4f32_const(
; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00> ; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; AVX512-NEXT: ret <4 x float> [[TMP1]] ; AVX512-NEXT: ret <4 x float> [[TMP1]]
llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -7,11 +7,39 @@
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @add_sub_v8i32(+; SSE-LABEL: @add_sub_v8i32(
-; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]+; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]+; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x i32> [[TMP3]]+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT: ret <8 x i32> [[TMP5]]
+;
+; SLM-LABEL: @add_sub_v8i32(
+; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX1-LABEL: @add_sub_v8i32(
+; AVX1-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX1-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX2-LABEL: @add_sub_v8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX512-LABEL: @add_sub_v8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[TMP3]]
; ;
%a0 = extractelement <8 x i32> %a, i32 0 %a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1 %a1 = extractelement <8 x i32> %a, i32 1
@@ -106,14 +134,16 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT: ret <8 x i32> [[R71]]+; SSE-NEXT: ret <8 x i32> [[TMP5]]
; ;
; SLM-LABEL: @ashr_shl_v8i32( ; SLM-LABEL: @ashr_shl_v8i32(
; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] ; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] ; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[TMP3]]+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
; ;
; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-LABEL: @ashr_shl_v8i32(
; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
@@ -174,16 +204,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT: ret <8 x i32> [[R71]]+; SSE-NEXT: ret <8 x i32> [[TMP5]]
; ;
; SLM-LABEL: @ashr_shl_v8i32_const( ; SLM-LABEL: @ashr_shl_v8i32_const(
; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[R71]]+; SLM-NEXT: ret <8 x i32> [[TMP5]]
; ;
; AVX1-LABEL: @ashr_shl_v8i32_const( ; AVX1-LABEL: @ashr_shl_v8i32_const(
; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3> ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
@@ -501,13 +531,49 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
} }
define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
-; CHECK-LABEL: @add_sub_v8i32_splat(+; SSE-LABEL: @add_sub_v8i32_splat(
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]+; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x i32> [[TMP5]]+; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @add_sub_v8i32_splat(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[TMP7]]
+;
+; AVX1-LABEL: @add_sub_v8i32_splat(
+; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX2-LABEL: @add_sub_v8i32_splat(
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX512-LABEL: @add_sub_v8i32_splat(
+; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[TMP5]]
; ;
%a0 = extractelement <8 x i32> %a, i32 0 %a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1 %a1 = extractelement <8 x i32> %a, i32 1
llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -7,11 +7,39 @@
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @add_sub_v8i32(+; SSE-LABEL: @add_sub_v8i32(
-; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]+; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]+; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x i32> [[TMP3]]+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT: ret <8 x i32> [[TMP5]]
+;
+; SLM-LABEL: @add_sub_v8i32(
+; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX1-LABEL: @add_sub_v8i32(
+; AVX1-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX1-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX2-LABEL: @add_sub_v8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX512-LABEL: @add_sub_v8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[TMP3]]
; ;
%a0 = extractelement <8 x i32> %a, i32 0 %a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1 %a1 = extractelement <8 x i32> %a, i32 1
@@ -106,14 +134,16 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison> ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT: ret <8 x i32> [[R71]]+; SSE-NEXT: ret <8 x i32> [[TMP5]]
; ;
; SLM-LABEL: @ashr_shl_v8i32( ; SLM-LABEL: @ashr_shl_v8i32(
; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] ; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] ; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[TMP3]]+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT: ret <8 x i32> [[TMP5]]
; ;
; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-LABEL: @ashr_shl_v8i32(
; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
@@ -174,16 +204,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT: ret <8 x i32> [[R71]]+; SSE-NEXT: ret <8 x i32> [[TMP5]]
; ;
; SLM-LABEL: @ashr_shl_v8i32_const( ; SLM-LABEL: @ashr_shl_v8i32_const(
; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2) ; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[R71]]+; SLM-NEXT: ret <8 x i32> [[TMP5]]
; ;
; AVX1-LABEL: @ashr_shl_v8i32_const( ; AVX1-LABEL: @ashr_shl_v8i32_const(
; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3> ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
@@ -501,13 +531,49 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
} }
define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
-; CHECK-LABEL: @add_sub_v8i32_splat(+; SSE-LABEL: @add_sub_v8i32_splat(
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]+; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x i32> [[TMP5]]+; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @add_sub_v8i32_splat(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[TMP7]]
+;
+; AVX1-LABEL: @add_sub_v8i32_splat(
+; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX2-LABEL: @add_sub_v8i32_splat(
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x i32> [[TMP5]]
+;
+; AVX512-LABEL: @add_sub_v8i32_splat(
+; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[TMP5]]
; ;
%a0 = extractelement <8 x i32> %a, i32 0 %a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1 %a1 = extractelement <8 x i32> %a, i32 1
llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
@@ -7,7 +7,7 @@ define void @test() {
; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0 ; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[ADD]], i32 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[ADD]], i32 3
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[ICMP:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2+; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0
; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64
; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr ptr addrspace(1), ptr addrspace(1) null, i64 [[ZEXT]] ; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr ptr addrspace(1), ptr addrspace(1) null, i64 [[ZEXT]]
@@ -16,6 +16,8 @@ define void @test() {
; CHECK-NEXT: [[CALL:%.*]] = call i32 null(<2 x double> zeroinitializer) ; CHECK-NEXT: [[CALL:%.*]] = call i32 null(<2 x double> zeroinitializer)
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[CALL]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> [[TMP1]], i64 4)
; CHECK-NEXT: ret void ; CHECK-NEXT: ret void
; ;
bb: bb:
llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
@@ -15,8 +15,8 @@ define ptr @test(ptr %0, ptr %args_gep) {
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[ARG26]], i64 17 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[ARG26]], i64 17
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 8 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 8
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG1]], i64 12 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG1]], i64 12
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !noalias [[META0:![0-9]+]]+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8, !noalias [[META0:![0-9]+]]
-; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8, !noalias [[META0]]+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !noalias [[META0]]
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
@@ -9,10 +9,10 @@ define void @test(ptr noalias %0, ptr noalias %1) {
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 poison, i32 poison>+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP7]], <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 5>+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP7]], <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 5>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP10]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11>+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP14]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11>
; CHECK-NEXT: store <6 x double> [[TMP13]], ptr [[TMP5]], align 8 ; CHECK-NEXT: store <6 x double> [[TMP13]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8 ; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8
llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -286,8 +286,8 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S
; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
-; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
+; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
@@ -10,22 +10,24 @@ define i32 @bar() local_unnamed_addr {
; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef
; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef
; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 [[SUB102_1]], i32 4+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef>, i32 [[SUB102_1]], i32 2
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 5+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[ADD94_1]], i32 3
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef>, i32 [[SUB86_1]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[ADD78_1]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[ADD78_2]], i32 5
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15>+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[SUB102_3]], i32 6
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 undef, i32 undef, i32 poison>, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 7, i32 6, i32 5, i32 4, i32 24, i32 25, i32 26, i32 27, i32 poison, i32 29, i32 30, i32 poison>+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[SUB102_3]], i32 12+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 8, i32 9, i32 3, i32 2, i32 5, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12>+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 14>
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP8]]+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 poison>, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 13, i32 14, i32 7>
-; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP8]]+; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <8 x i32> [[TMP7]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP18]], <8 x i32> [[TMP10]], i64 8)
; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i32> [[TMP11]], splat (i32 15) ; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i32> [[TMP11]], splat (i32 15)
; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[TMP12]], splat (i32 65537) ; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[TMP12]], splat (i32 65537)
; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <16 x i32> [[TMP13]], splat (i32 65535) ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <16 x i32> [[TMP13]], splat (i32 65535)
-; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP11]]+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP20]]
; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP16]]) ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP16]])
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP17]], 16 ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP17]], 16
llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
@@ -6,11 +6,11 @@ define i1 @foo() {
; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_2329_I_I:%.*]] = icmp ne i32 0, 0 ; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_2329_I_I:%.*]] = icmp ne i32 0, 0
; CHECK-NEXT: [[STOREMERGE_2333_I_I:%.*]] = select i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 0, i32 0 ; CHECK-NEXT: [[STOREMERGE_2333_I_I:%.*]] = select i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 0, i32 0
-; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_1_2_I_I:%.*]] = icmp ne i32 [[STOREMERGE_2333_I_I]], 0+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[STOREMERGE_2333_I_I]], i32 1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL_NOT_NOT509_I_1_2_I_I]], i32 4+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> zeroinitializer, [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i1> [[TMP0]], i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 5+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP1]], <4 x i1> zeroinitializer, i64 0)+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> <i1 false, i1 false, i1 undef, i1 undef>, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v2i1(<8 x i1> [[TMP2]], <2 x i1> zeroinitializer, i64 6)+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef>, <4 x i1> [[TMP6]], i64 4)
; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]])
; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP5]], i1 false ; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP5]], i1 false
llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -153,8 +153,8 @@ define float @foo3(ptr nocapture readonly %A) #0 {
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 5, i32 1, i32 2, i32 poison> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 5, i32 1, i32 2, i32 poison>
@@ -283,7 +283,7 @@ define void @test(ptr %i1, ptr %i2, ptr %o, i1 %arg) {
; CHECK-NEXT: [[I1_0:%.*]] = load x86_fp80, ptr [[I1:%.*]], align 16 ; CHECK-NEXT: [[I1_0:%.*]] = load x86_fp80, ptr [[I1:%.*]], align 16
; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr x86_fp80, ptr [[I1]], i64 1 ; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr x86_fp80, ptr [[I1]], i64 1
; CHECK-NEXT: [[I1_1:%.*]] = load x86_fp80, ptr [[I1_GEP1]], align 16 ; CHECK-NEXT: [[I1_1:%.*]] = load x86_fp80, ptr [[I1_GEP1]], align 16
-; CHECK-NEXT: br i1 %arg, label [[THEN:%.*]], label [[END:%.*]]+; CHECK-NEXT: br i1 [[ARG:%.*]], label [[THEN:%.*]], label [[END:%.*]]
; CHECK: then: ; CHECK: then:
; CHECK-NEXT: [[I2_0:%.*]] = load x86_fp80, ptr [[I2:%.*]], align 16 ; CHECK-NEXT: [[I2_0:%.*]] = load x86_fp80, ptr [[I2:%.*]], align 16
; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, ptr [[I2]], i64 1 ; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, ptr [[I2]], i64 1
llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
@@ -103,10 +103,10 @@ define void @test2(ptr %p1, ptr %p2) {
; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> <double 4.000000e+00, double 4.100000e+00>, [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> <double 4.000000e+00, double 4.100000e+00>, [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], <double 2.000000e+00, double 2.100000e+00> ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], <double 2.000000e+00, double 2.100000e+00>
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP10]], <double 3.000000e+00, double 3.100000e+00> ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP10]], <double 3.000000e+00, double 3.100000e+00>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2: ; CHECK: bb2:
-; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x double> [ [[TMP11]], [[BB1]] ], [ [[TMP16:%.*]], [[BB6:%.*]] ]+; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP15:%.*]], [[BB6:%.*]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[X0:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i32 0 ; CHECK-NEXT: [[X0:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, ptr [[X0]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, ptr [[X0]], align 8
; CHECK-NEXT: br i1 poison, label [[BB3:%.*]], label [[BB6]] ; CHECK-NEXT: br i1 poison, label [[BB3:%.*]], label [[BB6]]
@@ -117,8 +117,7 @@ define void @test2(ptr %p1, ptr %p2) {
; CHECK: bb5: ; CHECK: bb5:
; CHECK-NEXT: br label [[BB6]] ; CHECK-NEXT: br label [[BB6]]
; CHECK: bb6: ; CHECK: bb6:
-; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP13]], [[BB2]] ], [ [[TMP14]], [[BB4]] ], [ [[TMP14]], [[BB5]] ]+; CHECK-NEXT: [[TMP15]] = phi <2 x double> [ [[TMP13]], [[BB2]] ], [ [[TMP14]], [[BB4]] ], [ [[TMP14]], [[BB5]] ]
-; CHECK-NEXT: [[TMP16]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: br label [[BB2]] ; CHECK-NEXT: br label [[BB2]]
; ;
entry: entry:
llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
@@ -11,9 +11,9 @@ define void @test() {
; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i16> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i16> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP7]], [[TMP8]]+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[TMP7]], [[TMP8]]+; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> <i32 1, i32 4, i32 3, i32 6>+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i16> zeroinitializer, [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i16> zeroinitializer, [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = sub <4 x i16> zeroinitializer, [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = sub <4 x i16> zeroinitializer, [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 6, i32 7> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-10 < %s | FileCheck %s+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-4 < %s | FileCheck %s
define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) { define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
; CHECK-LABEL: define i32 @test( ; CHECK-LABEL: define i32 @test(
@@ -13,7 +13,7 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
; CHECK-NEXT: br i1 false, label %[[D_EXIT_3]], label %[[D_EXIT_6:.*]] ; CHECK-NEXT: br i1 false, label %[[D_EXIT_3]], label %[[D_EXIT_6:.*]]
; CHECK: [[D_EXIT_3]]: ; CHECK: [[D_EXIT_3]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[ENTRY]] ], [ poison, %[[IF_END_I_1]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[ENTRY]] ], [ poison, %[[IF_END_I_1]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 0, i32 0>, i32 [[RETVAL_0_I_219]], i32 0+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[RETVAL_0_I_219]], i32 0
; CHECK-NEXT: br i1 [[TOBOOL_I_4]], label %[[D_EXIT_4:.*]], label %[[D_EXIT_6]] ; CHECK-NEXT: br i1 [[TOBOOL_I_4]], label %[[D_EXIT_4:.*]], label %[[D_EXIT_6]]
; CHECK: [[D_EXIT_4]]: ; CHECK: [[D_EXIT_4]]:
; CHECK-NEXT: br label %[[D_EXIT_6]] ; CHECK-NEXT: br label %[[D_EXIT_6]]
@@ -21,25 +21,29 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
; CHECK-NEXT: br i1 false, label %[[D_EXIT_6]], label %[[D_EXIT_7:.*]] ; CHECK-NEXT: br i1 false, label %[[D_EXIT_6]], label %[[D_EXIT_7:.*]]
; CHECK: [[D_EXIT_6]]: ; CHECK: [[D_EXIT_6]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP1]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ [[TMP1]], %[[D_EXIT_4]] ] ; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP1]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ [[TMP1]], %[[D_EXIT_4]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP2]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP2]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ zeroinitializer, %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
; CHECK-NEXT: br label %[[D_EXIT_7]] ; CHECK-NEXT: br label %[[D_EXIT_7]]
; CHECK: [[D_EXIT_7]]: ; CHECK: [[D_EXIT_7]]:
-; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP3]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]+; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP3]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i32> [ [[TMP4]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]+; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ [[TMP4]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP8]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 poison, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> <i32 1, i32 poison, i32 poison, i32 1, i32 poison, i32 1, i32 1, i32 poison>, <8 x i32> <i32 8, i32 1, i32 2, i32 11, i32 poison, i32 13, i32 14, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP0]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[RETVAL_0_I_219]], i32 7
-; CHECK-NEXT: [[TMP12:%.*]] = add <8 x i32> [[TMP11]], [[TMP7]]
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1> ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 1, i32 1>, i32 [[RETVAL_0_I_219]], i32 0+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[RETVAL_0_I_219]], i32 3
; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP22:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP21]], <4 x i32> [[TMP10]], i64 4)
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 poison>
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> <i32 poison, i32 poison, i32 1, i32 1, i32 1, i32 poison, i32 poison, i32 1>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 5, i32 6, i32 15>
+; CHECK-NEXT: [[TMP19:%.*]] = add <8 x i32> [[TMP18]], [[TMP22]]
+; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP19]], i64 0)
+; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i32> [[TMP20]], [[TMP16]]
+; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP19]], <4 x i32> [[RDX_OP]], i64 0)
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP12]]) ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP12]])
-; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP16]])+; CHECK-NEXT: ret i32 [[TMP17]]
-; CHECK-NEXT: [[OP_RDX4:%.*]] = or i32 [[TMP18]], [[TMP17]]
-; CHECK-NEXT: ret i32 [[OP_RDX4]]
; ;
entry: entry:
%0 = load i32, ptr %f, align 4 %0 = load i32, ptr %f, align 4
llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
@@ -7,20 +7,14 @@ define void @test(i1 %c, ptr %arg) {
; CHECK: if: ; CHECK: if:
; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 ; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2)
; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK-NEXT: br label [[JOIN:%.*]]
; CHECK: else: ; CHECK: else:
; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 ; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0)
-; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2)
; CHECK-NEXT: br label [[JOIN]] ; CHECK-NEXT: br label [[JOIN]]
; CHECK: join: ; CHECK: join:
; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] ; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ]
llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
@@ -6,23 +6,19 @@ define i32 @a() {
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]: ; CHECK: [[BB1]]:
-; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ]+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
; CHECK-NEXT: [[TMP6]] = load <4 x i8>, ptr null, align 4 ; CHECK-NEXT: [[TMP6]] = load <4 x i8>, ptr null, align 4
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3>+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3, i32 poison>+; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP10]], <4 x i8> [[TMP6]], i64 4)
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 2, i32 3, i32 12, i32 3, i32 12, i32 13, i32 14>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 9, i32 13>
; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]] ; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]] ; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]]
-; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr null, align 4+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP23]], <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>+; CHECK-NEXT: store <8 x i8> [[TMP13]], ptr null, align 4
; CHECK-NEXT: br label %[[BB1]] ; CHECK-NEXT: br label %[[BB1]]
; ;
br label %1 br label %1
llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 -mcpu=znver2 < %s | FileCheck %s
+
+define i1 @test(ptr %0, ptr %1, <2 x float> %2, <2 x float> %3, <2 x float> %4) {
+; CHECK-LABEL: define i1 @test(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], <2 x float> [[TMP3:%.*]], <2 x float> [[TMP4:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[TMP7]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 6, i32 6>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 poison>
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP9]], i32 7
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP16]], <8 x float> [[TMP15]], i64 8)
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> [[TMP12]], <16 x i32> <i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 14, i32 14, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 poison>
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP9]], i32 15
+; CHECK-NEXT: [[TMP20:%.*]] = fmul <16 x float> [[TMP17]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP20]])
+; CHECK-NEXT: [[TMP22:%.*]] = call float @foo(float [[TMP21]])
+; CHECK-NEXT: ret i1 false
+;
+ %6 = load float, ptr %1, align 4
+ %7 = load float, ptr %0, align 4
+ %8 = fmul float %6, %6
+ %9 = fmul float %7, %7
+ %10 = fmul float %7, %7
+ %11 = fmul float %7, %7
+ %12 = fmul float %7, %7
+ %13 = fmul float %7, %7
+ %14 = load float, ptr %0, align 4
+ %15 = fmul float %14, %14
+ %16 = fmul float %14, %14
+ %17 = extractelement <2 x float> %2, i64 0
+ %18 = extractelement <2 x float> %2, i64 0
+ %19 = fmul float %17, %17
+ %20 = fmul float %18, %18
+ %21 = extractelement <2 x float> %3, i64 0
+ %22 = extractelement <2 x float> %2, i64 0
+ %23 = fmul float %21, %21
+ %24 = fmul float %22, %22
+ %25 = extractelement <2 x float> %2, i64 0
+ %26 = extractelement <2 x float> %2, i64 0
+ %27 = fmul float %25, %25
+ %28 = fmul float %26, %26
+ %29 = extractelement <2 x float> %2, i64 0
+ %30 = extractelement <2 x float> %4, i64 0
+ %31 = fmul float %29, %29
+ %32 = fmul float %30, %30
+ %33 = fadd reassoc nsz float %8, %9
+ %34 = fadd reassoc nsz float %33, %10
+ %35 = fadd reassoc nsz float %34, %11
+ %36 = fadd reassoc nsz float %35, %12
+ %37 = fadd reassoc nsz float %36, %13
+ %38 = fadd reassoc nsz float %37, %15
+ %39 = fadd reassoc nsz float %38, %16
+ %40 = fadd reassoc nsz float %39, %19
+ %41 = fadd reassoc nsz float %40, %20
+ %42 = fadd reassoc nsz float %41, %23
+ %43 = fadd reassoc nsz float %42, %24
+ %44 = fadd reassoc nsz float %43, %27
+ %45 = fadd reassoc nsz float %44, %28
+ %46 = fadd reassoc nsz float %45, %31
+ %47 = fadd reassoc nsz float %46, %32
+ %48 = call float @foo(float %47)
+ ret i1 false
+}
+
+declare float @foo(float)
llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll
@@ -0,0 +1,222 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(i32 %0, i8 %1, i64 %2, float %3) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[TMP0:%.*]], i8 [[TMP1:%.*]], i64 [[TMP2:%.*]], float [[TMP3:%.*]]) {
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], <i64 255, i64 -65536>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> <i64 1, i64 poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP9]], <i64 1, i64 16>
+; CHECK-NEXT: [[TMP11:%.*]] = trunc <2 x i64> [[TMP10]] to <2 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[TMP11]], <2 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP13:%.*]] = uitofp <2 x i8> [[TMP12]] to <2 x float>
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = uitofp <4 x i8> [[TMP15]] to <4 x float>
+; CHECK-NEXT: [[TMP17:%.*]] = fdiv <2 x float> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP18]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = sitofp <2 x i32> [[TMP19]] to <2 x float>
+; CHECK-NEXT: [[TMP21:%.*]] = fdiv <2 x float> zeroinitializer, [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = trunc <2 x i64> [[TMP7]] to <2 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> zeroinitializer, [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = ashr <2 x i32> [[TMP23]], splat (i32 1)
+; CHECK-NEXT: [[TMP25:%.*]] = sitofp <2 x i32> [[TMP24]] to <2 x float>
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP28:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP27]], <4 x float> [[TMP16]], i64 4)
+; CHECK-NEXT: [[TMP29:%.*]] = fdiv <8 x float> zeroinitializer, [[TMP28]]
+; CHECK-NEXT: [[TMP30:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP29]])
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x float> [[TMP30]] to <8 x i32>
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ult <8 x i32> [[TMP31]], splat (i32 1325400064)
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i1> [[TMP32]], i32 6
+; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i64 0, i64 2147483648
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP32]], i32 5
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 4286578688
+; CHECK-NEXT: [[TMP37:%.*]] = or i64 [[TMP34]], [[TMP36]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i1> [[TMP32]], i32 7
+; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i64 0, i64 128
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP32]], i32 4
+; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i64 0, i64 128
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <8 x i1> [[TMP32]], i32 2
+; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i64 0, i64 8388608
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP32]], i32 3
+; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 32768
+; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP43]], [[TMP45]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0
+; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 0, i64 8388608
+; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i1> [[TMP32]], i32 1
+; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i64 0, i64 32768
+; CHECK-NEXT: br label %[[BB52:.*]]
+; CHECK: [[BB51:.*]]:
+; CHECK-NEXT: unreachable
+; CHECK: [[BB52]]:
+; CHECK-NEXT: br label %[[BB53:.*]]
+; CHECK: [[BB53]]:
+; CHECK-NEXT: [[TMP54:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP17]])
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 0, ptr null)
+; CHECK-NEXT: [[TMP55:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP21]])
+; CHECK-NEXT: [[TMP56:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <8 x float> [[TMP56]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP58:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP57]], <2 x float> [[TMP55]], i64 0)
+; CHECK-NEXT: [[TMP59:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP58]], <2 x float> [[TMP54]], i64 6)
+; CHECK-NEXT: [[TMP60:%.*]] = bitcast <8 x float> [[TMP59]] to <8 x i32>
+; CHECK-NEXT: [[TMP61:%.*]] = icmp ult <8 x i32> [[TMP60]], splat (i32 1325400064)
+; CHECK-NEXT: [[TMP62:%.*]] = extractelement <8 x i1> [[TMP61]], i32 5
+; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i64 [[TMP37]], i64 0
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <8 x i1> [[TMP61]], i32 4
+; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i64 0, i64 4294967168
+; CHECK-NEXT: [[TMP66:%.*]] = or i64 [[TMP63]], [[TMP65]]
+; CHECK-NEXT: [[TMP67:%.*]] = extractelement <8 x i1> [[TMP61]], i32 7
+; CHECK-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i64 0, i64 8388608
+; CHECK-NEXT: [[TMP69:%.*]] = extractelement <8 x i1> [[TMP61]], i32 6
+; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP69]], i64 0, i64 32768
+; CHECK-NEXT: [[TMP71:%.*]] = or i64 [[TMP68]], [[TMP70]]
+; CHECK-NEXT: [[TMP72:%.*]] = or i64 [[TMP71]], [[TMP66]]
+; CHECK-NEXT: [[TMP73:%.*]] = or i64 [[TMP72]], [[TMP39]]
+; CHECK-NEXT: store i64 [[TMP73]], ptr null, align 1
+; CHECK-NEXT: store i64 [[TMP41]], ptr null, align 1
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <8 x i1> [[TMP61]], i32 3
+; CHECK-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], i64 0, i64 -9223372036854775808
+; CHECK-NEXT: [[TMP76:%.*]] = extractelement <8 x i1> [[TMP61]], i32 2
+; CHECK-NEXT: [[TMP77:%.*]] = zext i1 [[TMP76]] to i64
+; CHECK-NEXT: [[TMP78:%.*]] = or i64 [[TMP46]], [[TMP77]]
+; CHECK-NEXT: [[TMP79:%.*]] = or i64 [[TMP78]], [[TMP75]]
+; CHECK-NEXT: store i64 [[TMP79]], ptr null, align 1
+; CHECK-NEXT: [[TMP80:%.*]] = extractelement <8 x i1> [[TMP61]], i32 1
+; CHECK-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], i64 0, i64 2147483648
+; CHECK-NEXT: [[TMP82:%.*]] = extractelement <8 x i1> [[TMP61]], i32 0
+; CHECK-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i64 0, i64 128
+; CHECK-NEXT: [[TMP84:%.*]] = or i64 [[TMP83]], [[TMP50]]
+; CHECK-NEXT: [[TMP85:%.*]] = or i64 [[TMP84]], [[TMP48]]
+; CHECK-NEXT: [[TMP86:%.*]] = or i64 [[TMP85]], [[TMP81]]
+; CHECK-NEXT: store i64 [[TMP86]], ptr null, align 1
+; CHECK-NEXT: br label %[[BB51]]
+;
+ %5 = and i64 %2, 255
+ %6 = and i64 %2, -65536
+ %7 = add i64 %5, 1
+ %8 = add i64 %2, %6
+ %9 = lshr i64 %7, 1
+ %10 = trunc i64 %9 to i8
+ %11 = tail call i8 @llvm.smax.i8(i8 %10, i8 0)
+ %12 = lshr i64 %8, 16
+ %13 = trunc i64 %12 to i8
+ %14 = tail call i8 @llvm.smax.i8(i8 %13, i8 0)
+ %15 = uitofp i8 %11 to float
+ %16 = uitofp i8 %14 to float
+ %17 = uitofp i8 %1 to float
+ %18 = uitofp i8 %1 to float
+ %19 = uitofp i8 %1 to float
+ %20 = fdiv float 0.000000e+00, %17
+ %21 = fdiv float %15, 0.000000e+00
+ %22 = fdiv float %16, 0.000000e+00
+ %23 = call float @llvm.fabs.f32(float %20)
+ %24 = bitcast float %23 to i32
+ %25 = icmp ult i32 %24, 1325400064
+ %26 = fdiv float 0.000000e+00, %18
+ %27 = fdiv float 0.000000e+00, %19
+ %28 = call float @llvm.fabs.f32(float %27)
+ %29 = bitcast float %28 to i32
+ %30 = icmp ult i32 %29, 1325400064
+ %31 = select i1 %30, i64 0, i64 2147483648
+ %32 = call float @llvm.fabs.f32(float %26)
+ %33 = bitcast float %32 to i32
+ %34 = icmp ult i32 %33, 1325400064
+ %35 = select i1 %34, i64 0, i64 4286578688
+ %36 = or i64 %31, %35
+ %37 = select i1 %25, i64 0, i64 128
+ %38 = fdiv float 0.000000e+00, %17
+ %39 = call float @llvm.fabs.f32(float %38)
+ %40 = bitcast float %39 to i32
+ %41 = icmp ult i32 %40, 1325400064
+ %42 = select i1 %41, i64 0, i64 128
+ %43 = trunc i64 %5 to i32
+ %44 = sub i32 0, %43
+ %45 = trunc i64 %6 to i32
+ %46 = sub i32 0, %45
+ %47 = ashr i32 %44, 1
+ %48 = ashr i32 %46, 1
+ %49 = sitofp i32 %0 to float
+ %50 = sitofp i32 %47 to float
+ %51 = sitofp i32 %48 to float
+ %52 = sitofp i32 %0 to float
+ %53 = fdiv float 0.000000e+00, %50
+ %54 = fdiv float 0.000000e+00, %51
+ %55 = call float @llvm.fabs.f32(float %53)
+ %56 = bitcast float %55 to i32
+ %57 = icmp ult i32 %56, 1325400064
+ %58 = call float @llvm.fabs.f32(float %54)
+ %59 = bitcast float %58 to i32
+ %60 = icmp ult i32 %59, 1325400064
+ %61 = select i1 %60, i64 0, i64 8388608
+ %62 = select i1 %57, i64 0, i64 32768
+ %63 = or i64 %61, %62
+ %64 = fdiv float 0.000000e+00, %49
+ %65 = fdiv float 0.000000e+00, %50
+ %66 = fdiv float 0.000000e+00, %51
+ %67 = fdiv float 0.000000e+00, %52
+ %68 = call float @llvm.fabs.f32(float %65)
+ %69 = bitcast float %68 to i32
+ %70 = icmp ult i32 %69, 1325400064
+ %71 = call float @llvm.fabs.f32(float %66)
+ %72 = bitcast float %71 to i32
+ %73 = icmp ult i32 %72, 1325400064
+ %74 = select i1 %73, i64 0, i64 8388608
+ %75 = select i1 %70, i64 0, i64 32768
+ br label %77
+
+76: ; preds = %78
+ unreachable
+
+77: ; preds = %4
+ br label %78
+
+78: ; preds = %77
+ %79 = call float @llvm.fabs.f32(float %22)
+ %80 = bitcast float %79 to i32
+ %81 = icmp ult i32 %80, 1325400064
+ %82 = call float @llvm.fabs.f32(float %21)
+ %83 = bitcast float %82 to i32
+ %84 = icmp ult i32 %83, 1325400064
+ %85 = bitcast float %3 to i32
+ %86 = icmp ult i32 %85, 1325400064
+ %87 = select i1 %86, i64 %36, i64 0
+ %88 = bitcast float %3 to i32
+ %89 = icmp ult i32 %88, 1325400064
+ %90 = select i1 %89, i64 0, i64 4294967168
+ %91 = or i64 %87, %90
+ %92 = select i1 %81, i64 0, i64 8388608
+ %93 = select i1 %84, i64 0, i64 32768
+ %94 = or i64 %92, %93
+ %95 = or i64 %94, %91
+ %96 = or i64 %95, %37
+ store i64 %96, ptr null, align 1
+ call void @llvm.lifetime.start.p0(i64 0, ptr null)
+ store i64 %42, ptr null, align 1
+ %97 = bitcast float %3 to i32
+ %98 = icmp ult i32 %97, 1325400064
+ %99 = select i1 %98, i64 0, i64 -9223372036854775808
+ %100 = bitcast float %3 to i32
+ %101 = icmp ult i32 %100, 1325400064
+ %102 = zext i1 %101 to i64
+ %103 = or i64 %63, %102
+ %104 = or i64 %103, %99
+ store i64 %104, ptr null, align 1
+ %105 = call float @llvm.fabs.f32(float %67)
+ %106 = bitcast float %105 to i32
+ %107 = icmp ult i32 %106, 1325400064
+ %108 = call float @llvm.fabs.f32(float %64)
+ %109 = bitcast float %108 to i32
+ %110 = icmp ult i32 %109, 1325400064
+ %111 = select i1 %107, i64 0, i64 2147483648
+ %112 = select i1 %110, i64 0, i64 128
+ %113 = or i64 %112, %75
+ %114 = or i64 %113, %74
+ %115 = or i64 %114, %111
+ store i64 %115, ptr null, align 1
+ br label %76
+}
llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
@@ -35,8 +35,8 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -35,8 +35,8 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -29,8 +29,8 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
llvm/test/Transforms/SLPVectorizer/addsub.ll
@@ -387,14 +387,10 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re
define void @vec_shuff_reorder() #0 { define void @vec_shuff_reorder() #0 {
; CHECK-LABEL: @vec_shuff_reorder( ; CHECK-LABEL: @vec_shuff_reorder(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr @fa, align 4+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @fb, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @fa, align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
-; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2)
; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 0, i32 5, i32 2, i32 7> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>