[SLP] Vectorize struct-returning intrinsics#199433
Conversation
Created using spr 1.3.7
|
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-amdgpu Author: Alexey Bataev (alexey-bataev). Changes: Allow SLP to combine across lanes calls that return a literal struct. Original Pull Request: #195521. Original Pull Request2: #196756. Recommit after revert #198265 (comment). Added check for valid vectorizable type, small corner-case fixes. Patch is 711.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/199433.diff 14 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6bd595c86d55a..54a4d6b68b2e5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
@@ -70,6 +71,7 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/VectorTypeUtils.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
@@ -299,10 +301,10 @@ static const unsigned MaxPHINumOperands = 128;
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
// TODO: Support ScalableVectorType.
- if (SLPReVec && isa<FixedVectorType>(Ty))
- Ty = Ty->getScalarType();
- return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
- !Ty->isPPC_FP128Ty();
+ if (SLPReVec && isVectorizedTy(Ty) && !getVectorizedTypeVF(Ty).isScalable())
+ Ty = toScalarizedTy(Ty);
+ return canVectorizeTy(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty() &&
+ !Ty->isVoidTy();
}
/// Returns the "element type" of the given value/instruction \p V.
@@ -327,15 +329,33 @@ static Type *getValueType(Value *V, bool LookThroughCmp = false) {
static unsigned getNumElements(Type *Ty) {
assert(!isa<ScalableVectorType>(Ty) &&
"ScalableVectorType is not supported.");
- if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
- return VecTy->getNumElements();
+ if (isVectorizedTy(Ty))
+ return getVectorizedTypeVF(Ty).getFixedValue();
return 1;
}
/// \returns the vector type of ScalarTy based on vectorization factor.
-static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
- return FixedVectorType::get(ScalarTy->getScalarType(),
- VF * getNumElements(ScalarTy));
+static Type *getWidenedType(Type *ScalarTy, unsigned VF) {
+ if (VF == 1 && !isVectorizedTy(ScalarTy)) {
+ // Workaround for 1 x vector types: toVectorizedTy returns the type
+ // unchanged when EC is scalar, but BoUpSLP relies on widening to
+ // <1 x ScalarTy> (or struct of <1 x ElTy>) to keep the rest of the
+ // pipeline operating on vector types.
+ if (auto *StructTy = dyn_cast<StructType>(ScalarTy)) {
+ assert(isUnpackedStructLiteral(StructTy) &&
+ "expected unpacked struct literal");
+ assert(all_of(StructTy->elements(), VectorType::isValidElementType) &&
+ "expected all element types to be valid vector element types");
+ return StructType::get(
+ StructTy->getContext(),
+ map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
+ return FixedVectorType::get(ElTy, 1);
+ }));
+ }
+ return FixedVectorType::get(ScalarTy, 1);
+ }
+ return toVectorizedTy(toScalarizedTy(ScalarTy),
+ ElementCount::getFixed(VF * getNumElements(ScalarTy)));
}
/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
@@ -343,7 +363,7 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
Type *Ty, unsigned Sz) {
- if (!isValidElementType(Ty))
+ if (!isValidElementType(Ty) || isa<StructType>(Ty))
return bit_ceil(Sz);
// Find the number of elements, which forms full vectors.
const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
@@ -358,7 +378,7 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
unsigned Sz) {
- if (!isValidElementType(Ty))
+ if (!isValidElementType(Ty) || isa<StructType>(Ty))
return bit_floor(Sz);
// Find the number of elements, which forms full vectors.
unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
@@ -2038,6 +2058,8 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
return false;
if (has_single_bit(Sz))
return true;
+ if (isa<StructType>(Ty))
+ return false;
const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
Sz % NumParts == 0;
@@ -2047,19 +2069,20 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
/// phase. If the type is going to be scalarized or does not uses whole
/// registers, returns 1.
static unsigned
-getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
- Type *ScalarTy,
+getNumberOfParts(const TargetTransformInfo &TTI, Type *VecTy, Type *ScalarTy,
const unsigned Limit = std::numeric_limits<unsigned>::max()) {
+ if (isa<StructType>(VecTy))
+ return 1;
unsigned NumParts = TTI.getNumberOfParts(VecTy);
if (NumParts == 0 || NumParts >= Limit)
return 1;
unsigned Sz = getNumElements(VecTy);
unsigned ScalarSz = getNumElements(ScalarTy);
- unsigned PWSz =
- getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz);
+ Type *ElementTy = toScalarizedTy(VecTy);
+ unsigned PWSz = getFullVectorNumberOfElements(TTI, ElementTy, Sz);
if (NumParts >= Sz || PWSz % NumParts != 0 ||
(PWSz / NumParts) % ScalarSz != 0 ||
- !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts))
+ !hasFullVectorsOrPowerOf2(TTI, ElementTy, PWSz / NumParts))
return 1;
const unsigned NumElts = PWSz / NumParts;
if (divideCeil(Sz, NumElts) != NumParts)
@@ -2208,14 +2231,14 @@ class slpvectorizer::BoUpSLP {
ReductionBitWidth >=
DL->getTypeSizeInBits(
VectorizableTree.front()->Scalars.front()->getType()))
- return getWidenedType(
- VectorizableTree.front()->Scalars.front()->getType(),
- VectorizableTree.front()->getVectorFactor());
- return getWidenedType(
+ return cast<FixedVectorType>(
+ getWidenedType(VectorizableTree.front()->Scalars.front()->getType(),
+ VectorizableTree.front()->getVectorFactor()));
+ return cast<FixedVectorType>(getWidenedType(
IntegerType::get(
VectorizableTree.front()->Scalars.front()->getContext(),
ReductionBitWidth),
- VectorizableTree.front()->getVectorFactor());
+ VectorizableTree.front()->getVectorFactor()));
}
/// Returns true if the tree results in one of the reduced bitcasts variants.
@@ -3988,8 +4011,7 @@ class slpvectorizer::BoUpSLP {
/// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself
/// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs.
InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
- VectorType *VecTy,
- VectorType *FinalVecTy,
+ Type *VecTy, Type *FinalVecTy,
TTI::TargetCostKind CostKind) const;
/// This is the recursive part of buildTree.
@@ -4324,6 +4346,12 @@ class slpvectorizer::BoUpSLP {
/// other nodes as a series of insertvector instructions.
SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
+ /// For ExtractValue entries that are vectorized via the struct-call path
+ /// (checkEVsForVecCalls succeeded during tree building), stores the common
+ /// field-index path shared by all scalars in the bundle. Empty for all
+ /// other entry kinds.
+ SmallVector<unsigned, 1> StructEVIndices;
+
private:
/// The operands of each instruction in each lane Operands[op_index][lane].
/// Note: This helps avoid the replication of the code that performs the
@@ -4598,6 +4626,11 @@ class slpvectorizer::BoUpSLP {
else
dbgs() << "<invalid>";
dbgs() << "\n";
+ if (!StructEVIndices.empty()) {
+ dbgs() << "StructEVIndices: ";
+ interleaveComma(StructEVIndices, dbgs());
+ dbgs() << "\n";
+ }
if (!CombinedEntriesWithIndices.empty()) {
dbgs() << "Combined entries: ";
interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
@@ -7106,12 +7139,12 @@ static InstructionCost getExtractWithExtendCost(
const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
VectorType *VecTy, unsigned Index,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
- if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
+ if (isVectorizedTy(Dst)) {
assert(SLPReVec && "Only supported by REVEC.");
- auto *SubTp =
- getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
+ auto *SubTp = cast<FixedVectorType>(
+ getWidenedType(toScalarizedTy(VecTy), getNumElements(Dst)));
return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
- Index * ScalarTy->getNumElements(), SubTp) +
+ Index * getNumElements(Dst), SubTp) +
TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
CostKind);
}
@@ -7204,7 +7237,7 @@ static bool isMaskedLoadCompress(
InterleaveFactor = 0;
Type *ScalarTy = VL.front()->getType();
const size_t Sz = VL.size();
- auto *VecTy = getWidenedType(ScalarTy, Sz);
+ auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, Sz));
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
SmallVector<int> Mask;
if (!Order.empty())
@@ -7240,7 +7273,7 @@ static bool isMaskedLoadCompress(
// Check for very large distances between elements.
if (*Diff / Sz >= MaxRegSize / 8)
return false;
- LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+ LoadVecTy = cast<FixedVectorType>(getWidenedType(ScalarTy, *Diff + 1));
auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
Align CommonAlignment = LI->getAlign();
IsMasked = !isSafeToLoadUnconditionally(
@@ -7289,8 +7322,8 @@ static bool isMaskedLoadCompress(
}
if (IsStrided && !IsMasked && Order.empty()) {
// Check for potential segmented(interleaved) loads.
- VectorType *AlignedLoadVecTy = getWidenedType(
- ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
+ VectorType *AlignedLoadVecTy = cast<VectorType>(getWidenedType(
+ ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1)));
if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
DL, cast<LoadInst>(VL.back()), &AC, &DT,
&TLI))
@@ -7481,7 +7514,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
Type *StrideTy = DL->getIndexType(Ptr0->getType());
SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
- SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
+ SPtrInfo.Ty = cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
return true;
}
@@ -7536,7 +7569,8 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
NewScalarTy = Type::getIntNTy(
SE->getContext(),
DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets);
- FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
+ auto *StridedLoadTy =
+ cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
unsigned MinProfitableStridedOps =
IsLoad ? MinProfitableStridedLoads : MinProfitableStridedStores;
const unsigned BaseTyNumElts = getNumElements(BaseTy);
@@ -7735,7 +7769,9 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
- auto *VecTy = getWidenedType(ScalarTy, Sz);
+ auto *VecTy = dyn_cast<VectorType>(getWidenedType(ScalarTy, Sz));
+ if (!VecTy)
+ return LoadsState::Gather;
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
// Cache masked gather legality - both the !IsSorted path below and the
// post-branch check use the same VecTy/CommonAlignment, and the underlying
@@ -7816,7 +7852,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// estimate as a buildvector, otherwise estimate as splat.
APInt DemandedElts = APInt::getAllOnes(Sz);
Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
- VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
+ auto *PtrVecTy = cast<VectorType>(getWidenedType(PtrScalarTy, Sz));
// Cache the underlying object of PointerOps.front() - it is invariant
// across the per-V comparisons below and getUnderlyingObject walks
// GEP/cast chains.
@@ -7913,7 +7949,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
}
for (const auto &[SliceStart, LS] : States) {
const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - SliceStart);
- auto *SubVecTy = getWidenedType(ScalarTy, SliceVF);
+ auto *SubVecTy = cast<VectorType>(getWidenedType(ScalarTy, SliceVF));
auto *LI0 = cast<LoadInst>(VL[SliceStart]);
InstructionCost VectorGEPCost =
(LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
@@ -8518,7 +8554,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
const auto *It = find_if_not(TE.Scalars, isConstant);
if (It == TE.Scalars.begin())
return OrdersType();
- auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
+ auto *Ty =
+ cast<VectorType>(getWidenedType(TE.Scalars.front()->getType(), Sz));
if (It != TE.Scalars.end()) {
OrdersType Order(Sz, Sz);
unsigned Idx = std::distance(TE.Scalars.begin(), It);
@@ -8776,6 +8813,12 @@ void BoUpSLP::reorderTopToBottom() {
// Maps a TreeEntry to the reorder indices of external users.
DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
ExternalUserReorderMap;
+ // TODO: Reordering of struct types is not supported.
+ if (any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::Vectorize &&
+ isa<StructType>(getValueType(TE->Scalars.front()));
+ }))
+ return;
// Compute IgnoreReorder once - it depends only on UserIgnoreList and
// VectorizableTree.front(), which do not change during this loop.
const bool IgnoreReorder =
@@ -8802,7 +8845,8 @@ void BoUpSLP::reorderTopToBottom() {
if (TE->hasState() && TE->isAltShuffle() &&
TE->State != TreeEntry::SplitVectorize) {
Type *ScalarTy = TE->Scalars[0]->getType();
- VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
+ auto *VecTy =
+ cast<VectorType>(getWidenedType(ScalarTy, TE->Scalars.size()));
unsigned Opcode0 = TE->getOpcode();
unsigned Opcode1 = TE->getAltOpcode();
SmallBitVector OpcodeMask(
@@ -9171,6 +9215,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
if (Users.first) {
auto &Data = Users;
+ // TODO: Reordering of struct types is not supported.
+ if (Data.first->State == TreeEntry::Vectorize &&
+ isa<StructType>(getValueType(Data.first->Scalars.front())))
+ continue;
if (Data.first->State == TreeEntry::SplitVectorize) {
assert(
Data.second.size() <= 2 &&
@@ -9444,6 +9492,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
TreeEntry *TE = Op.second;
if (!VisitedOps.insert(TE).second)
continue;
+ // TODO: Reordering of struct types is not supported.
+ if (TE->State == TreeEntry::Vectorize &&
+ isa<StructType>(getValueType(TE->Scalars.front())))
+ continue;
if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
reorderNodeWithReuses(*TE, Mask);
continue;
@@ -9540,13 +9592,14 @@ void BoUpSLP::buildExternalUses(
Value *Scalar = Entry->Scalars[Lane];
if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
continue;
+ bool IsStructScalar = isa<StructType>(Scalar->getType());
// All uses must be replaced already? No need to do it again.
auto It = ScalarToExtUses.find(Scalar);
if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
continue;
- if (Scalar->hasNUsesOrMore(NumVectScalars)) {
+ if (!IsStructScalar && Scalar->hasNUsesOrMore(NumVectScalars)) {
unsigned FoundLane = Entry->findLaneForValue(Scalar);
LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
<< " from " << *Scalar << "for many users.\n");
@@ -9607,14 +9660,16 @@ void BoUpSLP::buildExternalUses(
"Bad state");
continue;
}
- U = nullptr;
- if (It != ScalarToExtUses.end()) {
- ExternalUses[It->second].User = nullptr;
- break;
+ if (!IsStructScalar) {
+ U = nullptr;
+ if (It != ScalarToExtUses.end()) {
+ ExternalUses[It->second].User = nullptr;
+ break;
+ }
}
}
- if (U && Scalar->hasNUsesOrMore(UsesLimit))
+ if (U && !IsStructScalar && Scalar->hasNUsesOrMore(UsesLimit))
U = nullptr;
unsigned FoundLane = Entry->findLaneForValue(Scalar);
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
@@ -9971,7 +10026,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
Loads.size());
Align Alignment = computeCommonAlignment<LoadInst>(Values);
- auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
+ auto *Ty = cast<VectorType>(
+ getWidenedType(Loads.front()->getType(), Loads.size()));
return TTI->isLegalMaskedGather(Ty, Alignment) &&
!TTI->forceScalarizeMaskedGather(Ty, Alignment);
};
@@ -10269,7 +10325,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Segmented load detected - vectorize at maximum vector factor.
if (InterleaveFactor <= Slice.size() &&
TTI.isLegalInterleavedAccessType(
- getWidenedType(Slice.front()->getType(), VF),
+ cast<VectorType>(
+ getWidenedType(Slice.front()->getType(), VF)),
InterleaveFactor,
cast<LoadInst>(Slice.front())->getAlign(),
cast<LoadInst>(Slice.front())
@@ -10529,11 +10586,10 @@ buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
/// function (if possible) calls. Returns invalid cost for the corresponding
/// calls, if they cannot be vectorized/will be scalarized.
static std::pair<InstructionCost, InstructionCost>
-getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
- TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- ArrayRef<Type *> ArgTys) {
+getVectorCallCosts(CallInst *CI, Type *VecTy, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI, ArrayRef<Type *> ArgTys) {
auto Shape = VFShape::get(CI->getFunctionType(),
- ElementCount::getFixed(VecTy->getNumElements()),
+ ElementCount::getFixed(getNumElements(VecTy)),
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
auto LibCost = InstructionCost::getInvalid();
@@ -10594,6 +10650,77 @@ ArrayRef<const Loop *> BoUpSLP::getLoopNest(const Loop *L) {
return Res;
}
+/// Detects an extractvalue bundle that can be widened by vectorizing the
+/// underlying struct-returning calls.
+///
+/// \p VL is a bundle whose state \p S is Instruction::ExtractValue. The
+/// bundle is acceptable for widening into one struct-of-vectors call only
+/// when:
+/// - every element of \p VL is an ExtractValueIn...
[truncated]
|
Allow SLP to combine across lanes calls that return a literal struct
(llvm.sincos, llvm.*.with.overflow, llvm.frexp, ...) into a single
call returning a struct of vectors, by widening {T, T, ...} to
{<VF x T>, ...} via VectorTypeUtils and emitting extractvalue +
extractelement for external uses.
Original Pull Request: llvm/llvm-project#195521
Original Pull Request2: llvm/llvm-project#196756
Recommit after revert llvm/llvm-project#198265 (comment)
Added check for valid vectorizable type, small corner cases fixes
Reviewers:
Pull Request: llvm/llvm-project#199433
Allow SLP to combine across lanes calls that return a literal struct
(llvm.sincos, llvm.*.with.overflow, llvm.frexp, ...) into a single
call returning a struct of vectors, by widening {T, T, ...} to
{<VF x T>, ...} via VectorTypeUtils and emitting extractvalue +
extractelement for external uses.
Original Pull Request: llvm/llvm-project#195521
Original Pull Request2: llvm/llvm-project#196756
Recommit after revert llvm/llvm-project#198265 (comment)
Added check for valid vectorizable type, small corner cases fixes
Reviewers:
Pull Request: llvm/llvm-project#199433
Allow SLP to combine across lanes calls that return a literal struct
(llvm.sincos, llvm.*.with.overflow, llvm.frexp, ...) into a single
call returning a struct of vectors, by widening {T, T, ...} to
{<VF x T>, ...} via VectorTypeUtils and emitting extractvalue +
extractelement for external uses.
Original Pull Request: #195521
Original Pull Request2: #196756
Recommit after revert #198265 (comment)
Added check for valid vectorizable type, small corner cases fixes