[SLP] Ensure TreeCost is scaled for ordered fadd reductions#199388
[SLP] Ensure TreeCost is scaled for ordered fadd reductions#199388KavinTheG wants to merge 2 commits into
Conversation
Addresses an issue where `getScaleToLoopIterations()` returns 1 on isolated SLP trees because `UserTreeIndex` is invalid. This prevents `TreeCost` from scaling alongside `ReductionCost`, causing the cost model to incorrectly treat an unprofitable vector reduction as profitable.
|
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-risc-v Author: Kavin Gnanapandithan (KavinTheG) ChangesResolves #199267 Addresses an issue where This patch passes the reduction root instruction down into Full diff: https://github.com/llvm/llvm-project/pull/199388.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6bd595c86d55a..e83f3fcbe65c4 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2153,7 +2153,8 @@ class slpvectorizer::BoUpSLP {
/// Calculates the cost of the subtrees, trims non-profitable ones and returns
/// final cost.
InstructionCost
- calculateTreeCostAndTrimNonProfitable(ArrayRef<Value *> VectorizedVals = {});
+ calculateTreeCostAndTrimNonProfitable(ArrayRef<Value *> VectorizedVals = {},
+ Instruction *RdxRoot = nullptr);
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
@@ -3973,7 +3974,8 @@ class slpvectorizer::BoUpSLP {
/// LICM hoisting that optimizeGatherSequence() performs after vectorization
/// for inserts with loop-invariant operands. Falls back to the whole-entry
/// scale when per-lane information is unavailable or the feature is off.
- uint64_t getGatherNodeEffectiveScale(const TreeEntry &TE);
+ uint64_t getGatherNodeEffectiveScale(const TreeEntry &TE,
+ Instruction *U = nullptr);
/// Get the loop nest for the given loop \p L.
ArrayRef<const Loop *> getLoopNest(const Loop *L);
@@ -16283,14 +16285,15 @@ uint64_t BoUpSLP::getLoopNestScale(const Loop *L) {
return std::max<uint64_t>(1, Scale);
}
-uint64_t BoUpSLP::getGatherNodeEffectiveScale(const TreeEntry &TE) {
+uint64_t BoUpSLP::getGatherNodeEffectiveScale(const TreeEntry &TE,
+ Instruction *U) {
// Only meaningful for gather/buildvector-like entries; the per-lane
// insertelements that make up such an entry are LICM-hoistable by
// optimizeGatherSequence() when their operand is loop-invariant.
assert((TE.isGather() || TE.State == TreeEntry::SplitVectorize) &&
"Expected gather/split tree entry.");
- uint64_t BaseScale = getScaleToLoopIterations(TE);
+ uint64_t BaseScale = getScaleToLoopIterations(TE, nullptr, U);
if (!PerLaneGatherScale || LoopAwareTripCount == 0 || BaseScale <= 1)
return BaseScale;
@@ -16315,7 +16318,8 @@ uint64_t BoUpSLP::getGatherNodeEffectiveScale(const TreeEntry &TE) {
if (isConstant(V))
continue;
++N;
- uint64_t LaneScale = std::min(getScaleToLoopIterations(TE, V), BaseScale);
+ uint64_t LaneScale =
+ std::min(getScaleToLoopIterations(TE, V, U), BaseScale);
Sum = SaturatingAdd(Sum, LaneScale, &Overflow);
if (Overflow)
return BaseScale;
@@ -18642,8 +18646,9 @@ static T *performExtractsShuffleAction(
return Prev;
}
-InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
- ArrayRef<Value *> VectorizedVals) {
+InstructionCost
+BoUpSLP::calculateTreeCostAndTrimNonProfitable(ArrayRef<Value *> VectorizedVals,
+ Instruction *RdxRoot) {
SmallDenseMap<const TreeEntry *, InstructionCost> NodesCosts;
SmallPtrSet<Value *, 4> CheckedExtracts;
SmallSetVector<TreeEntry *, 4> GatheredLoadsNodes;
@@ -18752,8 +18757,8 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
}
}
if (!CostIsFree && !Scale) {
- Scale = IsGatherLike ? getGatherNodeEffectiveScale(TE)
- : getScaleToLoopIterations(TE);
+ Scale = IsGatherLike ? getGatherNodeEffectiveScale(TE, RdxRoot)
+ : getScaleToLoopIterations(TE, nullptr, RdxRoot);
C *= Scale;
EntryToScale.try_emplace(&TE, Scale);
if (!TE.isGather() && TE.hasState()) {
@@ -29490,7 +29495,8 @@ class HorizontalReduction {
V.transformNodes();
V.computeMinimumValueSizes();
- InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VL);
+ InstructionCost TreeCost =
+ V.calculateTreeCostAndTrimNonProfitable(VL, RdxRootInst);
V.buildExternalUses(LocalExternallyUsedValues);
InstructionCost ReductionCost =
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/ordered-reduction.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/ordered-reduction.ll
new file mode 100644
index 0000000000000..aa21b0f98f8a6
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/ordered-reduction.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr="+v" < %s | FileCheck %s
+
+target triple = "riscv64-unknown-linux-gnu"
+
+define void @main(ptr %arrayidx.i60, ptr %24, ptr %arrayidx44.i61, double %.pre117.i70, double %.pre116.i68, double %.pre115.i67, double %.pre114.i65, double %.pre113.i64, double %.pre.i62) {
+; CHECK-LABEL: define void @main(
+; CHECK-SAME: ptr [[ARRAYIDX_I60:%.*]], ptr [[TMP0:%.*]], ptr [[ARRAYIDX44_I61:%.*]], double [[DOTPRE117_I70:%.*]], double [[DOTPRE116_I68:%.*]], double [[DOTPRE115_I67:%.*]], double [[DOTPRE114_I65:%.*]], double [[DOTPRE113_I64:%.*]], double [[DOTPRE_I62:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[FOR_COND5_PREHEADER_I58:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY8_I71:.*]]
+; CHECK: [[FOR_BODY8_I71]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[DOTPRE117_I70]], %[[FOR_COND5_PREHEADER_I58]] ], [ [[TMP9:%.*]], %[[FOR_BODY8_I71]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[DOTPRE116_I68]], %[[FOR_COND5_PREHEADER_I58]] ], [ [[TMP1]], %[[FOR_BODY8_I71]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[DOTPRE115_I67]], %[[FOR_COND5_PREHEADER_I58]] ], [ [[TMP8:%.*]], %[[FOR_BODY8_I71]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[DOTPRE114_I65]], %[[FOR_COND5_PREHEADER_I58]] ], [ [[DIV_I86:%.*]], %[[FOR_BODY8_I71]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[DOTPRE113_I64]], %[[FOR_COND5_PREHEADER_I58]] ], [ [[TMP7:%.*]], %[[FOR_BODY8_I71]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[DOTPRE_I62]], %[[FOR_COND5_PREHEADER_I58]] ], [ [[TMP5]], %[[FOR_BODY8_I71]] ]
+; CHECK-NEXT: [[INDVARS_IV_I72:%.*]] = phi i64 [ 1, %[[FOR_COND5_PREHEADER_I58]] ], [ [[INDVARS_IV_NEXT_I74:%.*]], %[[FOR_BODY8_I71]] ]
+; CHECK-NEXT: [[ADD_I73:%.*]] = fadd double [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT_I74]] = add nuw nsw i64 [[INDVARS_IV_I72]], 1
+; CHECK-NEXT: [[ARRAYIDX23_I75:%.*]] = getelementptr inbounds nuw [8 x i8], ptr [[ARRAYIDX_I60]], i64 [[INDVARS_IV_NEXT_I74]]
+; CHECK-NEXT: [[TMP7]] = load double, ptr [[ARRAYIDX23_I75]], align 8
+; CHECK-NEXT: [[ADD24_I76:%.*]] = fadd double [[ADD_I73]], [[TMP7]]
+; CHECK-NEXT: [[ADD30_I77:%.*]] = fadd double [[TMP4]], [[ADD24_I76]]
+; CHECK-NEXT: [[ARRAYIDX34_I78:%.*]] = getelementptr inbounds nuw [8 x i8], ptr [[TMP0]], i64 [[INDVARS_IV_I72]]
+; CHECK-NEXT: [[ADD35_I79:%.*]] = fadd double [[TMP3]], [[ADD30_I77]]
+; CHECK-NEXT: [[ARRAYIDX40_I80:%.*]] = getelementptr inbounds nuw [8 x i8], ptr [[TMP0]], i64 [[INDVARS_IV_NEXT_I74]]
+; CHECK-NEXT: [[TMP8]] = load double, ptr [[ARRAYIDX40_I80]], align 8
+; CHECK-NEXT: [[ADD41_I81:%.*]] = fadd double [[TMP8]], [[ADD35_I79]]
+; CHECK-NEXT: [[ADD48_I82:%.*]] = fadd double [[TMP2]], [[ADD41_I81]]
+; CHECK-NEXT: [[ADD54_I83:%.*]] = fadd double [[TMP1]], [[ADD48_I82]]
+; CHECK-NEXT: [[ARRAYIDX60_I84:%.*]] = getelementptr inbounds nuw [8 x i8], ptr [[ARRAYIDX44_I61]], i64 [[INDVARS_IV_NEXT_I74]]
+; CHECK-NEXT: [[TMP9]] = load double, ptr [[ARRAYIDX60_I84]], align 8
+; CHECK-NEXT: [[TMP18:%.*]] = fadd double [[TMP9]], [[ADD54_I83]]
+; CHECK-NEXT: [[DIV_I86]] = fdiv double [[TMP18]], 9.000000e+00
+; CHECK-NEXT: store double [[DIV_I86]], ptr [[ARRAYIDX34_I78]], align 8
+; CHECK-NEXT: [[EXITCOND_NOT_I87:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_I74]], 1999
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I87]], label %[[FOR_INC66_I88:.*]], label %[[FOR_BODY8_I71]]
+; CHECK: [[FOR_INC66_I88]]:
+; CHECK-NEXT: ret void
+;
+for.cond.preheader:
+ br label %for.body
+
+for.body:
+ %25 = phi double [ %.pre117.i70, %for.cond.preheader ], [ %33, %for.body ]
+ %26 = phi double [ %.pre116.i68, %for.cond.preheader ], [ %25, %for.body ]
+ %27 = phi double [ %.pre115.i67, %for.cond.preheader ], [ %32, %for.body ]
+ %28 = phi double [ %.pre114.i65, %for.cond.preheader ], [ %div.i86, %for.body ]
+ %29 = phi double [ %.pre113.i64, %for.cond.preheader ], [ %31, %for.body ]
+ %30 = phi double [ %.pre.i62, %for.cond.preheader ], [ %29, %for.body ]
+ %indvars.iv.i72 = phi i64 [ 1, %for.cond.preheader ], [ %indvars.iv.next.i74, %for.body ]
+ %add.i73 = fadd double %29, %30
+ %indvars.iv.next.i74 = add nuw nsw i64 %indvars.iv.i72, 1
+ %arrayidx23.i75 = getelementptr inbounds nuw [8 x i8], ptr %arrayidx.i60, i64 %indvars.iv.next.i74
+ %31 = load double, ptr %arrayidx23.i75, align 8
+ %add24.i76 = fadd double %add.i73, %31
+ %add30.i77 = fadd double %28, %add24.i76
+ %arrayidx34.i78 = getelementptr inbounds nuw [8 x i8], ptr %24, i64 %indvars.iv.i72
+ %add35.i79 = fadd double %27, %add30.i77
+ %arrayidx40.i80 = getelementptr inbounds nuw [8 x i8], ptr %24, i64 %indvars.iv.next.i74
+ %32 = load double, ptr %arrayidx40.i80, align 8
+ %add41.i81 = fadd double %32, %add35.i79
+ %add48.i82 = fadd double %26, %add41.i81
+ %add54.i83 = fadd double %25, %add48.i82
+ %arrayidx60.i84 = getelementptr inbounds nuw [8 x i8], ptr %arrayidx44.i61, i64 %indvars.iv.next.i74
+ %33 = load double, ptr %arrayidx60.i84, align 8
+ %add61.i85 = fadd double %33, %add54.i83
+ %div.i86 = fdiv double %add61.i85, 9.000000e+00
+ store double %div.i86, ptr %arrayidx34.i78, align 8
+ %exitcond.not.i87 = icmp eq i64 %indvars.iv.next.i74, 1999
+ br i1 %exitcond.not.i87, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
|
|
@alexey-bataev, would appreciate a review! Extended |
| Scale = IsGatherLike ? getGatherNodeEffectiveScale(TE) | ||
| : getScaleToLoopIterations(TE); | ||
| Scale = IsGatherLike ? getGatherNodeEffectiveScale(TE, RdxRoot) | ||
| : getScaleToLoopIterations(TE, nullptr, RdxRoot); |
There was a problem hiding this comment.
No, do not pass RdxRoot here for getScaleToLoopIterations, must be passed only for gathers/buildvectors
| @@ -0,0 +1,76 @@ | |||
| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 | |||
There was a problem hiding this comment.
Precommit the test in a separate NFC patch
Resolves #199267
Addresses an issue where
getScaleToLoopIterations()can return 1 on isolated SLP trees becauseUserTreeIndexis invalid. This preventsTreeCostfrom scaling alongsideReductionCost, causing the cost model to incorrectly treat an unprofitable vector reduction as profitable.This patch passes the reduction root instruction down into
calculateTreeCostAndTrimNonProfitableand the underlying scale calculation sogetScaleToLoopIterationscan get the correct block context.