From c7a3b95c0f3e0aabc61e39ac635c340387765f30 Mon Sep 17 00:00:00 2001 From: "Duncan P. N. Exon Smith" Date: Fri, 18 Apr 2014 02:17:43 +0000 Subject: Revert "blockfreq: Rewrite BlockFrequencyInfoImpl" This reverts commits r206548, r206549 and r206549. There are some unit tests failing that aren't failing locally [1], so reverting until I have time to investigate. [1]: http://bb.pgr.jp/builders/ninja-x64-msvc-RA-centos6/builds/1816 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@206556 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/BlockFrequencyInfoImpl.h | 1889 +++----------------- lib/Analysis/BlockFrequencyInfo.cpp | 8 +- lib/Analysis/BlockFrequencyInfoImpl.cpp | 931 ---------- lib/Analysis/CMakeLists.txt | 1 - lib/CodeGen/MachineBlockFrequencyInfo.cpp | 12 +- test/Analysis/BlockFrequencyInfo/bad_input.ll | 50 - test/Analysis/BlockFrequencyInfo/basic.ll | 55 +- test/Analysis/BlockFrequencyInfo/double_exit.ll | 165 -- test/Analysis/BlockFrequencyInfo/irreducible.ll | 197 -- .../BlockFrequencyInfo/loop_with_branch.ll | 44 - .../nested_loop_with_branches.ll | 59 - test/CodeGen/XCore/llvm-intrinsics.ll | 6 +- 12 files changed, 316 insertions(+), 3101 deletions(-) delete mode 100644 lib/Analysis/BlockFrequencyInfoImpl.cpp delete mode 100644 test/Analysis/BlockFrequencyInfo/bad_input.ll delete mode 100644 test/Analysis/BlockFrequencyInfo/double_exit.ll delete mode 100644 test/Analysis/BlockFrequencyInfo/irreducible.ll delete mode 100644 test/Analysis/BlockFrequencyInfo/loop_with_branch.ll delete mode 100644 test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 66d27b7e4a..f891afdf55 100644 --- a/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// Shared implementation of BlockFrequency for IR and Machine Instructions. +// Shared implementation of BlockFrequencyInfo for IR and Machine Instructions. // //===----------------------------------------------------------------------===// @@ -16,6 +16,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/BasicBlock.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" @@ -24,1699 +26,374 @@ #include #include -//===----------------------------------------------------------------------===// -// -// PositiveFloat definition. -// -// TODO: Make this private to BlockFrequencyInfoImpl or delete. -// -//===----------------------------------------------------------------------===// namespace llvm { -class PositiveFloatBase { -public: - static const int MaxExponent = 16383; - static const int MinExponent = -16382; - static const int DefaultPrecision = 10; - - static void dump(uint64_t D, int16_t E, int Width); - static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width, - unsigned Precision); - static std::string toString(uint64_t D, int16_t E, int Width, - unsigned Precision); - static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); } - static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); } - static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); } - - static std::pair splitSigned(int64_t N) { - if (N >= 0) - return std::make_pair(N, false); - if (N == INT64_MIN) - return std::make_pair(uint64_t(N) + 1, true); - return std::make_pair(-N, true); - } - static int64_t joinSigned(uint64_t U, bool IsNeg) { - if (U > INT64_MAX) - return IsNeg ? INT64_MIN : INT64_MAX; - return IsNeg ? -int16_t(U) : U; - } - static int32_t extractLg(const std::pair &Lg) { - return Lg.first; - } - static int32_t extractLgFloor(const std::pair &Lg) { - return Lg.first - (Lg.second > 0); - } - static int32_t extractLgCeiling(const std::pair &Lg) { - return Lg.first + (Lg.second < 0); - } - static uint64_t getDiff(int16_t L, int16_t R) { - assert(L <= R && "arguments in wrong order"); - return (uint64_t)R - (uint64_t)L; - } - - static std::pair divide64(uint64_t L, uint64_t R); - static std::pair multiply64(uint64_t L, uint64_t R); - - static int compare(uint64_t L, uint64_t R, int Shift) { - assert(Shift >= 0); - assert(Shift < 64); - - uint64_t L_adjusted = L >> Shift; - if (L_adjusted < R) - return -1; - if (L_adjusted > R) - return 1; +class BranchProbabilityInfo; +class BlockFrequencyInfo; +class MachineBranchProbabilityInfo; +class MachineBlockFrequencyInfo; - return L > L_adjusted << Shift ? 1 : 0; - } +namespace bfi_detail { +template struct TypeMap {}; +template <> struct TypeMap { + typedef BasicBlock BlockT; + typedef Function FunctionT; + typedef BranchProbabilityInfo BranchProbabilityInfoT; }; - -/// \brief Simple representation of a positive floating point. -/// -/// PositiveFloat is a positive floating point number. It uses simple -/// saturation arithmetic, and every operation is well-defined for every value. -/// -/// The number is split into a signed exponent and unsigned digits. The number -/// represented is \c getDigits()*2^getExponent(). In this way, the digits are -/// much like the mantissa in the x87 long double, but there is no canonical -/// form, so the same number can be represented by many bit representations -/// (it's always in "denormal" mode). -/// -/// PositiveFloat is templated on the underlying integer type for digits, which -/// is expected to be one of uint64_t, uint32_t, uint16_t or uint8_t. -/// -/// Unlike builtin floating point types, PositiveFloat is portable. -/// -/// Unlike APFloat, PositiveFloat does not model architecture floating point -/// behaviour (this should make it a little faster), and implements most -/// operators (this makes it usable). -/// -/// PositiveFloat is totally ordered. However, there is no canonical form, so -/// there are multiple representations of most scalars. E.g.: -/// -/// PositiveFloat(8u, 0) == PositiveFloat(4u, 1) -/// PositiveFloat(4u, 1) == PositiveFloat(2u, 2) -/// PositiveFloat(2u, 2) == PositiveFloat(1u, 3) -/// -/// PositiveFloat implements most arithmetic operations. Precision is kept -/// where possible. Uses simple saturation arithmetic, so that operations -/// saturate to 0.0 or getLargest() rather than under or overflowing. It has -/// some extra arithmetic for unit inversion. 0.0/0.0 is defined to be 0.0. -/// Any other division by 0.0 is defined to be getLargest(). -/// -/// As a convenience for modifying the exponent, left and right shifting are -/// both implemented, and both interpret negative shifts as positive shifts in -/// the opposite direction. -/// -/// Future work might extract most of the implementation into a base class -/// (e.g., \c Float) that has an \c IsSigned template parameter. The initial -/// use case for this only needed positive semantics, but it wouldn't take much -/// work to extend. -/// -/// Exponents are limited to the range accepted by x87 long double. This makes -/// it trivial to add functionality to convert to APFloat (this is already -/// relied on for the implementation of printing). -template class PositiveFloat : PositiveFloatBase { -public: - static_assert(!std::numeric_limits::is_signed, - "only unsigned floats supported"); - - typedef DigitsT DigitsType; - -private: - typedef std::numeric_limits DigitsLimits; - - static const int Width = sizeof(DigitsType) * 8; - static_assert(Width <= 64, "invalid integer width for digits"); - -private: - DigitsType Digits; - int16_t Exponent; - -public: - PositiveFloat() : Digits(0), Exponent(0) {} - - PositiveFloat(DigitsType Digits, int16_t Exponent) - : Digits(Digits), Exponent(Exponent) {} - -private: - PositiveFloat(const std::pair &X) - : Digits(X.first), Exponent(X.second) {} - -public: - static PositiveFloat getZero() { return PositiveFloat(0, 0); } - static PositiveFloat getOne() { return PositiveFloat(1, 0); } - static PositiveFloat getLargest() { - return PositiveFloat(DigitsLimits::max(), MaxExponent); - } - static PositiveFloat getFloat(uint64_t N) { return adjustToWidth(N, 0); } - static PositiveFloat getInverseFloat(uint64_t N) { - return getFloat(N).invert(); - } - static PositiveFloat getFraction(DigitsType N, DigitsType D) { - return getQuotient(N, D); - } - - int16_t getExponent() const { return Exponent; } - DigitsType getDigits() const { return Digits; } - - template IntT toInt() const; - - bool isZero() const { return !Digits; } - bool isLargest() const { return *this == getLargest(); } - bool isOne() const { - if (Exponent > 0 || Exponent <= -Width) - return false; - return Digits == DigitsType(1) << -Exponent; - } - - /// \brief The log base 2, rounded. - /// - /// Get the lg of the scalar. lg 0 is defined to be INT32_MIN. - int32_t lg() const { return extractLg(lgImpl()); } - - /// \brief The log base 2, rounded towards INT32_MIN. - /// - /// Get the lg floor. lg 0 is defined to be INT32_MIN. - int32_t lgFloor() const { return extractLgFloor(lgImpl()); } - - /// \brief The log base 2, rounded towards INT32_MAX. - /// - /// Get the lg ceiling. lg 0 is defined to be INT32_MIN. - int32_t lgCeiling() const { return extractLgCeiling(lgImpl()); } - - bool operator==(const PositiveFloat &X) const { return compare(X) == 0; } - bool operator<(const PositiveFloat &X) const { return compare(X) < 0; } - bool operator!=(const PositiveFloat &X) const { return compare(X) != 0; } - bool operator>(const PositiveFloat &X) const { return compare(X) > 0; } - bool operator<=(const PositiveFloat &X) const { return compare(X) <= 0; } - bool operator>=(const PositiveFloat &X) const { return compare(X) >= 0; } - - bool operator!() const { return isZero(); } - - /// \brief Convert to a decimal representation in a string. - /// - /// Convert to a string. Uses scientific notation for very large/small - /// numbers. Scientific notation is used roughly for numbers outside of the - /// range 2^-64 through 2^64. - /// - /// \c Precision indicates the number of decimal digits of precision to use; - /// 0 requests the maximum available. - /// - /// As a special case to make debugging easier, if the number is small enough - /// to convert without scientific notation and has more than \c Precision - /// digits before the decimal place, it's printed accurately to the first - /// digit past zero. E.g., assuming 10 digits of precision: - /// - /// 98765432198.7654... => 98765432198.8 - /// 8765432198.7654... => 8765432198.8 - /// 765432198.7654... => 765432198.8 - /// 65432198.7654... => 65432198.77 - /// 5432198.7654... => 5432198.765 - std::string toString(unsigned Precision = DefaultPrecision) { - return PositiveFloatBase::toString(Digits, Exponent, Width, Precision); - } - - /// \brief Print a decimal representation. - /// - /// Print a string. See toString for documentation. - raw_ostream &print(raw_ostream &OS, - unsigned Precision = DefaultPrecision) const { - return PositiveFloatBase::print(OS, Digits, Exponent, Width, Precision); - } - void dump() const { return PositiveFloatBase::dump(Digits, Exponent, Width); } - - PositiveFloat &operator+=(const PositiveFloat &X); - PositiveFloat &operator-=(const PositiveFloat &X); - PositiveFloat &operator*=(const PositiveFloat &X); - PositiveFloat &operator/=(const PositiveFloat &X); - PositiveFloat &operator<<=(int16_t Shift) { return shiftLeft(Shift); } - PositiveFloat &operator>>=(int16_t Shift) { return shiftRight(Shift); } - -private: - PositiveFloat &shiftLeft(int32_t Shift); - PositiveFloat &shiftRight(int32_t Shift); - PositiveFloat normalizeExponents(PositiveFloat X); - -public: - /// \brief Scale a large number accurately. - /// - /// Scale N (multiply it by this). Uses full precision multiplication, even - /// if Width is smaller than 64, so information is not lost. - uint64_t scale(uint64_t N) const; - uint64_t scaleByInverse(uint64_t N) const { - // TODO: implement directly, rather than relying on inverse. Inverse is - // expensive. - return inverse().scale(N); - } - int64_t scale(int64_t N) const { - std::pair Unsigned = splitSigned(N); - return joinSigned(scale(Unsigned.first), Unsigned.second); - } - int64_t scaleByInverse(int64_t N) const { - std::pair Unsigned = splitSigned(N); - return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second); - } - - int compare(const PositiveFloat &X) const; - int compareTo(uint64_t N) const { - PositiveFloat Float = getFloat(N); - int Compare = compare(Float); - if (Width == 64 || Compare != 0) - return Compare; - - // Check for precision loss. We know *this == RoundTrip. - uint64_t RoundTrip = Float.template toInt(); - return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1; - } - int compareTo(int64_t N) const { return N < 0 ? 1 : compareTo(uint64_t(N)); } - - PositiveFloat &invert() { return *this = PositiveFloat::getFloat(1) / *this; } - PositiveFloat inverse() const { return PositiveFloat(*this).invert(); } - -private: - static PositiveFloat getProduct(DigitsType L, DigitsType R); - static PositiveFloat getQuotient(DigitsType Dividend, DigitsType Divisor); - - std::pair lgImpl() const; - static int countLeadingZerosWidth(DigitsType Digits) { - if (Width == 64) - return countLeadingZeros64(Digits); - if (Width == 32) - return countLeadingZeros32(Digits); - return countLeadingZeros32(Digits) + Width - 32; - } - - static PositiveFloat adjustToWidth(uint64_t N, int S) { - assert(S >= MinExponent); - assert(S <= MaxExponent); - if (Width == 64 || N <= DigitsLimits::max()) - return PositiveFloat(N, S); - - // Shift right. - int Shift = 64 - Width - countLeadingZeros64(N); - DigitsType Shifted = N >> Shift; - - // Round. - assert(S + Shift <= MaxExponent); - return getRounded(PositiveFloat(Shifted, S + Shift), - N & UINT64_C(1) << (Shift - 1)); - } - - static PositiveFloat getRounded(PositiveFloat P, bool Round) { - if (!Round) - return P; - if (P.Digits == DigitsLimits::max()) - // Careful of overflow in the exponent. - return PositiveFloat(1, P.Exponent) <<= Width; - return PositiveFloat(P.Digits + 1, P.Exponent); - } +template <> struct TypeMap { + typedef MachineBasicBlock BlockT; + typedef MachineFunction FunctionT; + typedef MachineBranchProbabilityInfo BranchProbabilityInfoT; }; - -template -PositiveFloat operator+(const PositiveFloat &L, - const PositiveFloat &R) { - return PositiveFloat(L) += R; -} -template -PositiveFloat operator-(const PositiveFloat &L, - const PositiveFloat &R) { - return PositiveFloat(L) -= R; -} -template -PositiveFloat operator*(const PositiveFloat &L, - const PositiveFloat &R) { - return PositiveFloat(L) *= R; -} -template -PositiveFloat operator/(const PositiveFloat &L, - const PositiveFloat &R) { - return PositiveFloat(L) /= R; -} -template -PositiveFloat operator<<(const PositiveFloat &F, - int16_t Shift) { - return PositiveFloat(F) <<= Shift; -} -template -PositiveFloat operator>>(const PositiveFloat &F, - int16_t Shift) { - return PositiveFloat(F) >>= Shift; } -template -raw_ostream &operator<<(raw_ostream &OS, const PositiveFloat &X) { - return X.print(OS, 10); -} +/// BlockFrequencyInfoImpl implements block frequency algorithm for IR and +/// Machine Instructions. Algorithm starts with value ENTRY_FREQ +/// for the entry block and then propagates frequencies using branch weights +/// from (Machine)BranchProbabilityInfo. LoopInfo is not required because +/// algorithm can find "backedges" by itself. +template +class BlockFrequencyInfoImpl { + typedef typename bfi_detail::TypeMap::BlockT BlockT; + typedef typename bfi_detail::TypeMap::FunctionT FunctionT; + typedef typename bfi_detail::TypeMap::BranchProbabilityInfoT + BranchProbabilityInfoT; -template -bool operator<(const PositiveFloat &L, uint64_t R) { - return L.compareTo(R) < 0; -} -template -bool operator>(const PositiveFloat &L, uint64_t R) { - return L.compareTo(R) > 0; -} -template -bool operator==(const PositiveFloat &L, uint64_t R) { - return L.compareTo(R) == 0; -} -template -bool operator!=(const PositiveFloat &L, uint64_t R) { - return L.compareTo(R) != 0; -} -template -bool operator<=(const PositiveFloat &L, uint64_t R) { - return L.compareTo(R) <= 0; -} -template -bool operator>=(const PositiveFloat &L, uint64_t R) { - return L.compareTo(R) >= 0; -} + DenseMap Freqs; -template -bool operator<(const PositiveFloat &L, int64_t R) { - return L.compareTo(R) < 0; -} -template -bool operator>(const PositiveFloat &L, int64_t R) { - return L.compareTo(R) > 0; -} -template -bool operator==(const PositiveFloat &L, int64_t R) { - return L.compareTo(R) == 0; -} -template -bool operator!=(const PositiveFloat &L, int64_t R) { - return L.compareTo(R) != 0; -} -template -bool operator<=(const PositiveFloat &L, int64_t R) { - return L.compareTo(R) <= 0; -} -template -bool operator>=(const PositiveFloat &L, int64_t R) { - return L.compareTo(R) >= 0; -} + BranchProbabilityInfoT *BPI; -template -bool operator<(const PositiveFloat &L, uint32_t R) { - return L.compareTo(uint64_t(R)) < 0; -} -template -bool operator>(const PositiveFloat &L, uint32_t R) { - return L.compareTo(uint64_t(R)) > 0; -} -template -bool operator==(const PositiveFloat &L, uint32_t R) { - return L.compareTo(uint64_t(R)) == 0; -} -template -bool operator!=(const PositiveFloat &L, uint32_t R) { - return L.compareTo(uint64_t(R)) != 0; -} -template -bool operator<=(const PositiveFloat &L, uint32_t R) { - return L.compareTo(uint64_t(R)) <= 0; -} -template -bool operator>=(const PositiveFloat &L, uint32_t R) { - return L.compareTo(uint64_t(R)) >= 0; -} + FunctionT *Fn; -template -bool operator<(const PositiveFloat &L, int32_t R) { - return L.compareTo(int64_t(R)) < 0; -} -template -bool operator>(const PositiveFloat &L, int32_t R) { - return L.compareTo(int64_t(R)) > 0; -} -template -bool operator==(const PositiveFloat &L, int32_t R) { - return L.compareTo(int64_t(R)) == 0; -} -template -bool operator!=(const PositiveFloat &L, int32_t R) { - return L.compareTo(int64_t(R)) != 0; -} -template -bool operator<=(const PositiveFloat &L, int32_t R) { - return L.compareTo(int64_t(R)) <= 0; -} -template -bool operator>=(const PositiveFloat &L, int32_t R) { - return L.compareTo(int64_t(R)) >= 0; -} + typedef GraphTraits< Inverse > GT; -template -bool operator<(uint64_t L, const PositiveFloat &R) { - return R > L; -} -template -bool operator>(uint64_t L, const PositiveFloat &R) { - return R < L; -} -template -bool operator==(uint64_t L, const PositiveFloat &R) { - return R == L; -} -template -bool operator<=(uint64_t L, const PositiveFloat &R) { - return R >= L; -} -template -bool operator>=(uint64_t L, const PositiveFloat &R) { - return R <= L; -} -template -bool operator!=(uint64_t L, const PositiveFloat &R) { - return R != L; -} -template -bool operator<(int64_t L, const PositiveFloat &R) { - return R > L; -} -template -bool operator>(int64_t L, const PositiveFloat &R) { - return R < L; -} -template -bool operator==(int64_t L, const PositiveFloat &R) { - return R == L; -} -template -bool operator<=(int64_t L, const PositiveFloat &R) { - return R >= L; -} -template -bool operator>=(int64_t L, const PositiveFloat &R) { - return R <= L; -} -template -bool operator!=(int64_t L, const PositiveFloat &R) { - return R != L; -} -template -bool operator<(uint32_t L, const PositiveFloat &R) { - return R > L; -} -template -bool operator>(uint32_t L, const PositiveFloat &R) { - return R < L; -} -template -bool operator==(uint32_t L, const PositiveFloat &R) { - return R == L; -} -template -bool operator<=(uint32_t L, const PositiveFloat &R) { - return R >= L; -} -template -bool operator>=(uint32_t L, const PositiveFloat &R) { - return R <= L; -} -template -bool operator!=(uint32_t L, const PositiveFloat &R) { - return R != L; -} -template -bool operator<(int32_t L, const PositiveFloat &R) { - return R > L; -} -template -bool operator>(int32_t L, const PositiveFloat &R) { - return R < L; -} -template -bool operator==(int32_t L, const PositiveFloat &R) { - return R == L; -} -template -bool operator<=(int32_t L, const PositiveFloat &R) { - return R >= L; -} -template -bool operator>=(int32_t L, const PositiveFloat &R) { - return R <= L; -} -template -bool operator!=(int32_t L, const PositiveFloat &R) { - return R != L; -} + static const uint64_t EntryFreq = 1 << 14; -template -uint64_t PositiveFloat::scale(uint64_t N) const { - if (Width == 64 || N <= DigitsLimits::max()) - return (getFloat(N) * *this).template toInt(); - std::pair Lg = lgImpl(); - if (extractLgFloor(Lg) >= 64) - return UINT64_MAX; - if (extractLgCeiling(Lg) <= -64) - return 0; - - uint64_t Result = 0; - for (int Bit = 0; Bit < 64; Bit += Width) { - PositiveFloat Digit = getFloat(N & DigitsLimits::max() << Bit); - Digit *= *this; - - uint64_t Sum = Result + (Digit.toInt() >> Bit); - if (Sum < Result) - return UINT64_MAX; - Result = Sum; + std::string getBlockName(BasicBlock *BB) const { + return BB->getName().str(); } - return Result; -} -template -PositiveFloat PositiveFloat::getProduct(DigitsType L, - DigitsType R) { - // Check for zero. - if (!L || !R) - return getZero(); + std::string getBlockName(MachineBasicBlock *MBB) const { + std::string str; + raw_string_ostream ss(str); + ss << "BB#" << MBB->getNumber(); - // Check for numbers that we can compute with 64-bit math. - if (Width <= 32) - return adjustToWidth(uint64_t(L) * uint64_t(R), 0); - - // Do the full thing. - return PositiveFloat(multiply64(L, R)); -} -template -PositiveFloat PositiveFloat::getQuotient(DigitsType Dividend, - DigitsType Divisor) { - // Check for zero. - if (!Dividend) - return getZero(); - if (!Divisor) - return getLargest(); - - if (Width == 64) - return PositiveFloat(divide64(Dividend, Divisor)); - - // We can compute this with 64-bit math. - int Shift = countLeadingZeros64(Dividend); - uint64_t Shifted = uint64_t(Dividend) << Shift; - uint64_t Quotient = Shifted / Divisor; - - // If Quotient needs to be shifted, then adjustToWidth will round. - if (Quotient > DigitsLimits::max()) - return adjustToWidth(Quotient, -Shift); - - // Round based on the value of the next bit. - return getRounded(PositiveFloat(Quotient, -Shift), - Shifted % Divisor >= getHalf(Divisor)); -} - -template -template -IntT PositiveFloat::toInt() const { - typedef std::numeric_limits Limits; - if (*this < 1) - return 0; - if (*this >= Limits::max()) - return Limits::max(); + if (const BasicBlock *BB = MBB->getBasicBlock()) + ss << " derived from LLVM BB " << BB->getName(); - IntT N = Digits; - if (Exponent > 0) { - assert(size_t(Exponent) < sizeof(IntT) * 8); - return N << Exponent; + return ss.str(); } - if (Exponent < 0) { - assert(size_t(-Exponent) < sizeof(IntT) * 8); - return N >> -Exponent; - } - return N; -} - -template -std::pair PositiveFloat::lgImpl() const { - if (isZero()) - return std::make_pair(INT32_MIN, 0); - - // Get the floor of the lg of Digits. - int32_t LocalFloor = Width - countLeadingZerosWidth(Digits) - 1; - - // Get the floor of the lg of this. - int32_t Floor = Exponent + LocalFloor; - if (Digits == UINT64_C(1) << LocalFloor) - return std::make_pair(Floor, 0); - // Round based on the next digit. - bool Round = Digits & UINT64_C(1) << (LocalFloor - 1); - return std::make_pair(Floor + Round, Round ? 1 : -1); -} - -template -PositiveFloat -PositiveFloat::normalizeExponents(PositiveFloat X) { - if (isZero() || X.isZero()) - return X; - - if (Exponent > X.Exponent) { - // Reverse the arguments. - *this = X.normalizeExponents(*this); - return X; + void setBlockFreq(BlockT *BB, BlockFrequency Freq) { + Freqs[BB] = Freq; + DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") = "; + printBlockFreq(dbgs(), Freq) << "\n"); } - if (Exponent == X.Exponent) - return X; - - int ExponentDiff = getDiff(Exponent, X.Exponent); - if (ExponentDiff >= 2 * Width) { - *this = getZero(); - return X; + /// getEdgeFreq - Return edge frequency based on SRC frequency and Src -> Dst + /// edge probability. + BlockFrequency getEdgeFreq(BlockT *Src, BlockT *Dst) const { + BranchProbability Prob = BPI->getEdgeProbability(Src, Dst); + return getBlockFreq(Src) * Prob; } - // Use up any leading zeros on X, and then shift this. - int ShiftX = std::min(countLeadingZerosWidth(X.Digits), ExponentDiff); - int ShiftThis = ExponentDiff - ShiftX; - - if (ShiftThis >= Width) { - *this = getZero(); - return X; + /// incBlockFreq - Increase BB block frequency by FREQ. + /// + void incBlockFreq(BlockT *BB, BlockFrequency Freq) { + Freqs[BB] += Freq; + DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") += "; + printBlockFreq(dbgs(), Freq) << " --> "; + printBlockFreq(dbgs(), Freqs[BB]) << "\n"); } - X.Digits <<= ShiftX; - X.Exponent -= ShiftX; - Digits >>= ShiftThis; - Exponent += ShiftThis; - return X; -} + // All blocks in postorder. + std::vector POT; -template -PositiveFloat &PositiveFloat:: -operator+=(const PositiveFloat &X) { - if (isLargest() || X.isZero()) - return *this; - if (isZero() || X.isLargest()) - return *this = X; - - // Normalize exponents. - PositiveFloat Scaled = normalizeExponents(X); - - // Check for zero again. - if (isZero()) - return *this = Scaled; - if (Scaled.isZero()) - return *this; - - // Compute sum. - DigitsType Sum = Digits + Scaled.Digits; - bool DidOverflow = Sum < Digits || Sum < Scaled.Digits; - Digits = Sum; - if (!DidOverflow) - return *this; - - if (Exponent == MaxExponent) - return *this = getLargest(); - - ++Exponent; - Digits = Digits >> 1 | UINT64_C(1) << (Width - 1); - - return *this; -} -template -PositiveFloat &PositiveFloat:: -operator-=(const PositiveFloat &X) { - if (X.isZero()) - return *this; - if (*this <= X) - return *this = getZero(); - - // Normalize exponents. - PositiveFloat Scaled = normalizeExponents(X); - assert(Digits >= Scaled.Digits); - - // Compute difference. - if (!Scaled.isZero()) { - Digits -= Scaled.Digits; - return *this; - } + // Map Block -> Position in reverse-postorder list. + DenseMap RPO; - // Check if X just barely lost its last bit. E.g., for 32-bit: - // - // 1*2^32 - 1*2^0 == 0xffffffff != 1*2^32 - if (*this == PositiveFloat(1, X.lgFloor() + Width)) { - Digits = DigitsType(0) - 1; - --Exponent; - } - return *this; -} -template -PositiveFloat &PositiveFloat:: -operator*=(const PositiveFloat &X) { - if (isZero()) - return *this; - if (X.isZero()) - return *this = X; - - // Save the exponents. - int32_t Exponents = int32_t(Exponent) + int32_t(X.Exponent); - - // Get the raw product. - *this = getProduct(Digits, X.Digits); - - // Combine with exponents. - return *this <<= Exponents; -} -template -PositiveFloat &PositiveFloat:: -operator/=(const PositiveFloat &X) { - if (isZero()) - return *this; - if (X.isZero()) - return *this = getLargest(); - - // Save the exponents. - int32_t Exponents = int32_t(Exponent) + -int32_t(X.Exponent); - - // Get the raw quotient. - *this = getQuotient(Digits, X.Digits); - - // Combine with exponents. - return *this <<= Exponents; -} -template -PositiveFloat &PositiveFloat::shiftLeft(int32_t Shift) { - if (Shift < 0) - return shiftRight(-Shift); - if (!Shift || isZero()) - return *this; - - // Shift as much as we can in the exponent. - int16_t ExponentShift = std::min(Shift, MaxExponent - Exponent); - Exponent += ExponentShift; - if (ExponentShift == Shift) - return *this; - - // Check this late, since it's rare. - if (isLargest()) - return *this; - - // Shift as far as possible. - int32_t RawShift = std::min(Shift, countLeadingZerosWidth(Digits)); - if (RawShift + ExponentShift < Shift) - // Saturate. - return *this = getLargest(); - - Digits <<= Shift; - return *this; -} + // For each loop header, record the per-iteration probability of exiting the + // loop. This is the reciprocal of the expected number of loop iterations. + typedef DenseMap LoopExitProbMap; + LoopExitProbMap LoopExitProb; -template -PositiveFloat &PositiveFloat::shiftRight(int32_t Shift) { - if (Shift < 0) - return shiftLeft(-Shift); - if (!Shift || isZero()) - return *this; - - // Shift as much as we can in the exponent. - int16_t ExponentShift = std::min(Shift, Exponent - MinExponent); - Exponent -= ExponentShift; - if (ExponentShift == Shift) - return *this; - - // Shift as far as possible. - int32_t RawShift = Shift - ExponentShift; - if (RawShift >= Width) - // Saturate. - return *this = getZero(); - - // May result in zero. - Digits >>= Shift; - return *this; -} + // (reverse-)postorder traversal iterators. + typedef typename std::vector::iterator pot_iterator; + typedef typename std::vector::reverse_iterator rpot_iterator; -template -int PositiveFloat::compare(const PositiveFloat &X) const { - // Check for zero. - if (isZero()) - return X.isZero() ? 0 : -1; - if (X.isZero()) - return 1; - - // Check for the scale. Use lgFloor to be sure that the exponent difference - // is always lower than 64. - int32_t lgL = lgFloor(), lgR = X.lgFloor(); - if (lgL != lgR) - return lgL < lgR ? -1 : 1; - - // Compare digits. - if (Exponent < X.Exponent) - return PositiveFloatBase::compare(Digits, X.Digits, X.Exponent - Exponent); - - return -PositiveFloatBase::compare(X.Digits, Digits, Exponent - X.Exponent); -} + pot_iterator pot_begin() { return POT.begin(); } + pot_iterator pot_end() { return POT.end(); } -template struct isPodLike> { - static const bool value = true; -}; -} - -//===----------------------------------------------------------------------===// -// -// BlockMass definition. -// -// TODO: Make this private to BlockFrequencyInfoImpl or delete. -// -//===----------------------------------------------------------------------===// -namespace llvm { + rpot_iterator rpot_begin() { return POT.rbegin(); } + rpot_iterator rpot_end() { return POT.rend(); } -/// \brief Mass of a block. -/// -/// This class implements a sort of fixed-point fraction always between 0.0 and -/// 1.0. getMass() == UINT64_MAX indicates a value of 1.0. -/// -/// Masses can be added and subtracted. Simple saturation arithmetic is used, -/// so arithmetic operations never overflow or underflow. -/// -/// Masses can be multiplied. Multiplication treats full mass as 1.0 and uses -/// an inexpensive floating-point algorithm that's off-by-one (almost, but not -/// quite, maximum precision). -/// -/// Masses can be scaled by \a BranchProbability at maximum precision. -class BlockMass { - uint64_t Mass; + rpot_iterator rpot_at(BlockT *BB) { + rpot_iterator I = rpot_begin(); + unsigned idx = RPO.lookup(BB); + assert(idx); + std::advance(I, idx - 1); -public: - BlockMass() : Mass(0) {} - explicit BlockMass(uint64_t Mass) : Mass(Mass) {} - - static BlockMass getEmpty() { return BlockMass(); } - static BlockMass getFull() { return BlockMass(UINT64_MAX); } - - uint64_t getMass() const { return Mass; } - - bool isFull() const { return Mass == UINT64_MAX; } - bool isEmpty() const { return !Mass; } - - bool operator!() const { return isEmpty(); } - - /// \brief Add another mass. - /// - /// Adds another mass, saturating at \a isFull() rather than overflowing. - BlockMass &operator+=(const BlockMass &X) { - uint64_t Sum = Mass + X.Mass; - Mass = Sum < Mass ? UINT64_MAX : Sum; - return *this; + assert(*I == BB); + return I; } - /// \brief Subtract another mass. + /// isBackedge - Return if edge Src -> Dst is a reachable backedge. /// - /// Subtracts another mass, saturating at \a isEmpty() rather than - /// undeflowing. - BlockMass &operator-=(const BlockMass &X) { - uint64_t Diff = Mass - X.Mass; - Mass = Diff > Mass ? 0 : Diff; - return *this; + bool isBackedge(BlockT *Src, BlockT *Dst) const { + unsigned a = RPO.lookup(Src); + if (!a) + return false; + unsigned b = RPO.lookup(Dst); + assert(b && "Destination block should be reachable"); + return a >= b; } - /// \brief Scale by another mass. - /// - /// The current implementation is a little imprecise, but it's relatively - /// fast, never overflows, and maintains the property that 1.0*1.0==1.0 - /// (where isFull represents the number 1.0). It's an approximation of - /// 128-bit multiply that gets right-shifted by 64-bits. - /// - /// For a given digit size, multiplying two-digit numbers looks like: - /// - /// U1 . L1 - /// * U2 . L2 - /// ============ - /// 0 . . L1*L2 - /// + 0 . U1*L2 . 0 // (shift left once by a digit-size) - /// + 0 . U2*L1 . 0 // (shift left once by a digit-size) - /// + U1*L2 . 0 . 0 // (shift left twice by a digit-size) - /// - /// BlockMass has 64-bit numbers. Split each into two 32-bit digits, stored - /// 64-bit. Add 1 to the lower digits, to model isFull as 1.0; this won't - /// overflow, since we have 64-bit storage for each digit. - /// - /// To do this accurately, (a) multiply into two 64-bit digits, incrementing - /// the upper digit on overflows of the lower digit (carry), (b) subtract 1 - /// from the lower digit, decrementing the upper digit on underflow (carry), - /// and (c) truncate the lower digit. For the 1.0*1.0 case, the upper digit - /// will be 0 at the end of step (a), and then will underflow back to isFull - /// (1.0) in step (b). - /// - /// Instead, the implementation does something a little faster with a small - /// loss of accuracy: ignore the lower 64-bit digit entirely. The loss of - /// accuracy is small, since the sum of the unmodelled carries is 0 or 1 - /// (i.e., step (a) will overflow at most once, and step (b) will underflow - /// only if step (a) overflows). - /// - /// This is the formula we're calculating: - /// - /// U1.L1 * U2.L2 == U1 * U2 + (U1 * (L2+1))>>32 + (U2 * (L1+1))>>32 - /// - /// As a demonstration of 1.0*1.0, consider two 4-bit numbers that are both - /// full (1111). - /// - /// U1.L1 * U2.L2 == U1 * U2 + (U1 * (L2+1))>>2 + (U2 * (L1+1))>>2 - /// 11.11 * 11.11 == 11 * 11 + (11 * (11+1))/4 + (11 * (11+1))/4 - /// == 1001 + (11 * 100)/4 + (11 * 100)/4 - /// == 1001 + 1100/4 + 1100/4 - /// == 1001 + 0011 + 0011 - /// == 1111 - BlockMass &operator*=(const BlockMass &X) { - uint64_t U1 = Mass >> 32, L1 = Mass & UINT32_MAX, U2 = X.Mass >> 32, - L2 = X.Mass & UINT32_MAX; - Mass = U1 * U2 + (U1 * (L2 + 1) >> 32) + ((L1 + 1) * U2 >> 32); - return *this; - } + /// getSingleBlockPred - return single BB block predecessor or NULL if + /// BB has none or more predecessors. + BlockT *getSingleBlockPred(BlockT *BB) { + typename GT::ChildIteratorType + PI = GraphTraits< Inverse >::child_begin(BB), + PE = GraphTraits< Inverse >::child_end(BB); - /// \brief Multiply by a branch probability. - /// - /// Multiply by P. Guarantees full precision. - /// - /// This could be naively implemented by multiplying by the numerator and - /// dividing by the denominator, but in what order? Multiplying first can - /// overflow, while dividing first will lose precision (potentially, changing - /// a non-zero mass to zero). - /// - /// The implementation mixes the two methods. Since \a BranchProbability - /// uses 32-bits and \a BlockMass 64-bits, shift the mass as far to the left - /// as there is room, then divide by the denominator to get a quotient. - /// Multiplying by the numerator and right shifting gives a first - /// approximation. - /// - /// Calculate the error in this first approximation by calculating the - /// opposite mass (multiply by the opposite numerator and shift) and - /// subtracting both from teh original mass. - /// - /// Add to the first approximation the correct fraction of this error value. - /// This time, multiply first and then divide, since there is no danger of - /// overflow. - /// - /// \pre P represents a fraction between 0.0 and 1.0. - BlockMass &operator*=(const BranchProbability &P); - - bool operator==(const BlockMass &X) const { return Mass == X.Mass; } - bool operator<(const BlockMass &X) const { return Mass < X.Mass; } - bool operator!=(const BlockMass &X) const { return !(*this == X); } - bool operator>(const BlockMass &X) const { return X < *this; } - bool operator<=(const BlockMass &X) const { return !(*this > X); } - bool operator>=(const BlockMass &X) const { return !(*this < X); } + if (PI == PE) + return nullptr; - /// \brief Convert to floating point. - /// - /// Convert to a float. \a isFull() gives 1.0, while \a isEmpty() gives - /// slightly above 0.0. - PositiveFloat toFloat() const; - - void dump() const; - raw_ostream &print(raw_ostream &OS) const; -}; - -inline BlockMass operator+(const BlockMass &L, const BlockMass &R) { - return BlockMass(L) += R; -} -inline BlockMass operator-(const BlockMass &L, const BlockMass &R) { - return BlockMass(L) -= R; -} -inline BlockMass operator*(const BlockMass &L, const BlockMass &R) { - return BlockMass(L) *= R; -} -inline BlockMass operator*(const BlockMass &L, const BranchProbability &R) { - return BlockMass(L) *= R; -} -inline BlockMass operator*(const BranchProbability &L, const BlockMass &R) { - return BlockMass(R) *= L; -} + BlockT *Pred = *PI; -inline raw_ostream &operator<<(raw_ostream &OS, const BlockMass &X) { - return X.print(OS); -} + ++PI; + if (PI != PE) + return nullptr; -template <> struct isPodLike { - static const bool value = true; -}; -} + return Pred; + } -//===----------------------------------------------------------------------===// -// -// BlockFrequencyInfoImpl definition. -// -//===----------------------------------------------------------------------===// -namespace llvm { + void doBlock(BlockT *BB, BlockT *LoopHead, + SmallPtrSet &BlocksInLoop) { -class BasicBlock; -class BranchProbabilityInfo; -class Function; -class Loop; -class LoopInfo; -class MachineBasicBlock; -class MachineBranchProbabilityInfo; -class MachineFunction; -class MachineLoop; -class MachineLoopInfo; - -/// \brief Base class for BlockFrequencyInfoImpl -/// -/// BlockFrequencyInfoImplBase has supporting data structures and some -/// algorithms for BlockFrequencyInfoImplBase. Only algorithms that depend on -/// the block type (or that call such algorithms) are skipped here. -/// -/// Nevertheless, the majority of the overall algorithm documention lives with -/// BlockFrequencyInfoImpl. See there for details. -class BlockFrequencyInfoImplBase { -public: - typedef PositiveFloat Float; + DEBUG(dbgs() << "doBlock(" << getBlockName(BB) << ")\n"); + setBlockFreq(BB, 0); - /// \brief Representative of a block. - /// - /// This is a simple wrapper around an index into the reverse-post-order - /// traversal of the blocks. - /// - /// Unlike a block pointer, its order has meaning (location in the - /// topological sort) and it's class is the same regardless of block type. - struct BlockNode { - typedef uint32_t IndexType; - IndexType Index; - - bool operator==(const BlockNode &X) const { return Index == X.Index; } - bool operator!=(const BlockNode &X) const { return Index != X.Index; } - bool operator<=(const BlockNode &X) const { return Index <= X.Index; } - bool operator>=(const BlockNode &X) const { return Index >= X.Index; } - bool operator<(const BlockNode &X) const { return Index < X.Index; } - bool operator>(const BlockNode &X) const { return Index > X.Index; } - - BlockNode() : Index(UINT32_MAX) {} - BlockNode(IndexType Index) : Index(Index) {} - - bool isValid() const { return Index <= getMaxIndex(); } - static size_t getMaxIndex() { return UINT32_MAX - 1; } - }; - - /// \brief Stats about a block itself. - struct FrequencyData { - Float Floating; - uint64_t Integer; - }; - - /// \brief Index of loop information. - struct WorkingData { - BlockNode ContainingLoop; ///< The block whose loop this block is inside. - uint32_t LoopIndex; ///< Index into PackagedLoops. - bool IsPackaged; ///< Has ContainingLoop been packaged up? - bool IsAPackage; ///< Has this block's loop been packaged up? - BlockMass Mass; ///< Mass distribution from the entry block. - - WorkingData() - : LoopIndex(UINT32_MAX), IsPackaged(false), IsAPackage(false) {} - - bool hasLoopHeader() const { return ContainingLoop.isValid(); } - bool isLoopHeader() const { return LoopIndex != UINT32_MAX; } - }; - - /// \brief Unscaled probability weight. - /// - /// Probability weight for an edge in the graph (including the - /// successor/target node). - /// - /// All edges in the original function are 32-bit. However, exit edges from - /// loop packages are taken from 64-bit exit masses, so we need 64-bits of - /// space in general. - /// - /// In addition to the raw weight amount, Weight stores the type of the edge - /// in the current context (i.e., the context of the loop being processed). - /// Is this a local edge within the loop, an exit from the loop, or a - /// backedge to the loop header? - struct Weight { - enum DistType { Local, Exit, Backedge }; - DistType Type; - BlockNode TargetNode; - uint64_t Amount; - Weight() : Type(Local), Amount(0) {} - }; - - /// \brief Distribution of unscaled probability weight. - /// - /// Distribution of unscaled probability weight to a set of successors. - /// - /// This class collates the successor edge weights for later processing. - /// - /// \a DidOverflow indicates whether \a Total did overflow while adding to - /// the distribution. It should never overflow twice. There's no flag for - /// whether \a ForwardTotal overflows, since when \a Total exceeds 32-bits - /// they both get re-computed during \a normalize(). - struct Distribution { - typedef SmallVector WeightList; - WeightList Weights; ///< Individual successor weights. - uint64_t Total; ///< Sum of all weights. - bool DidOverflow; ///< Whether \a Total did overflow. - uint32_t ForwardTotal; ///< Total excluding backedges. - - Distribution() : Total(0), DidOverflow(false), ForwardTotal(0) {} - void addLocal(const BlockNode &Node, uint64_t Amount) { - add(Node, Amount, Weight::Local); + if (BB == LoopHead) { + setBlockFreq(BB, EntryFreq); + return; } - void addExit(const BlockNode &Node, uint64_t Amount) { - add(Node, Amount, Weight::Exit); + + if (BlockT *Pred = getSingleBlockPred(BB)) { + if (BlocksInLoop.count(Pred)) + setBlockFreq(BB, getEdgeFreq(Pred, BB)); + // TODO: else? irreducible, ignore it for now. + return; } - void addBackedge(const BlockNode &Node, uint64_t Amount) { - add(Node, Amount, Weight::Backedge); + + bool isInLoop = false; + bool isLoopHead = false; + + for (typename GT::ChildIteratorType + PI = GraphTraits< Inverse >::child_begin(BB), + PE = GraphTraits< Inverse >::child_end(BB); + PI != PE; ++PI) { + BlockT *Pred = *PI; + + if (isBackedge(Pred, BB)) { + isLoopHead = true; + } else if (BlocksInLoop.count(Pred)) { + incBlockFreq(BB, getEdgeFreq(Pred, BB)); + isInLoop = true; + } + // TODO: else? irreducible. } - /// \brief Normalize the distribution. - /// - /// Combines multiple edges to the same \a Weight::TargetNode and scales - /// down so that \a Total fits into 32-bits. - /// - /// This is linear in the size of \a Weights. For the vast majority of - /// cases, adjacent edge weights are combined by sorting WeightList and - /// combining adjacent weights. However, for very large edge lists an - /// auxiliary hash table is used. - void normalize(); - - private: - void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type); - }; - - /// \brief Data for a packaged loop. - /// - /// Contains the data necessary to represent represent a loop as a node once - /// it's packaged. - /// - /// PackagedLoopData inherits from BlockData to give the node the necessary - /// stats. Further, it has a list of successors, list of members, and stores - /// the backedge mass assigned to this loop. - struct PackagedLoopData { - typedef SmallVector, 4> ExitMap; - typedef SmallVector MemberList; - BlockNode Header; ///< Header. - ExitMap Exits; ///< Successor edges (and weights). - MemberList Members; ///< Members of the loop. - BlockMass BackedgeMass; ///< Mass returned to loop header. - BlockMass Mass; - Float Scale; - - PackagedLoopData(const BlockNode &Header) : Header(Header) {} - }; - - /// \brief Data about each block. This is used downstream. - std::vector Freqs; - - /// \brief Loop data: see initializeLoops(). - std::vector Working; - - /// \brief Indexed information about packaged loops. - std::vector PackagedLoops; - - /// \brief Create the initial loop packages. - /// - /// Initializes PackagedLoops using the data in Working about backedges - /// and containing loops. Called by initializeLoops(). - /// - /// \post WorkingData::LoopIndex has been initialized for every loop header - /// and PackagedLoopData::Members has been initialized. + if (!isInLoop) + return; - /// \brief Add all edges out of a packaged loop to the distribution. - /// - /// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each - /// successor edge. - void addLoopSuccessorsToDist(const BlockNode &LoopHead, - const BlockNode &LocalLoopHead, - Distribution &Dist); + if (!isLoopHead) + return; - /// \brief Add an edge to the distribution. - /// - /// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the - /// edge is forward/exit/backedge is in the context of LoopHead. Otherwise, - /// every edge should be a forward edge (since all the loops are packaged - /// up). - void addToDist(Distribution &Dist, const BlockNode &LoopHead, - const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight); - - PackagedLoopData &getLoopPackage(const BlockNode &Head) { - assert(Head.Index < Working.size()); - size_t Index = Working[Head.Index].LoopIndex; - assert(Index < PackagedLoops.size()); - return PackagedLoops[Index]; + // This block is a loop header, so boost its frequency by the expected + // number of loop iterations. The loop blocks will be revisited so they all + // get this boost. + typename LoopExitProbMap::const_iterator I = LoopExitProb.find(BB); + assert(I != LoopExitProb.end() && "Loop header missing from table"); + Freqs[BB] /= I->second; + DEBUG(dbgs() << "Loop header scaled to "; + printBlockFreq(dbgs(), Freqs[BB]) << ".\n"); } - /// \brief Distribute mass according to a distribution. - /// - /// Distributes the mass in Source according to Dist. If LoopHead.isValid(), - /// backedges and exits are stored in its entry in PackagedLoops. - /// - /// Mass is distributed in parallel from two copies of the source mass. - /// - /// The first mass (forward) represents the distribution of mass through the - /// local DAG. This distribution should lose mass at loop exits and ignore - /// backedges. - /// - /// The second mass (general) represents the behavior of the loop in the - /// global context. In a given distribution from the head, how much mass - /// exits, and to where? How much mass returns to the loop head? - /// - /// The forward mass should be split up between local successors and exits, - /// but only actually distributed to the local successors. The general mass - /// should be split up between all three types of successors, but distributed - /// only to exits and backedges. - void distributeMass(const BlockNode &Source, const BlockNode &LoopHead, - Distribution &Dist); - - /// \brief Compute the loop scale for a loop. - void computeLoopScale(const BlockNode &LoopHead); - - /// \brief Package up a loop. - void packageLoop(const BlockNode &LoopHead); - - /// \brief Finalize frequency metrics. - /// - /// Unwraps loop packages, calculates final frequencies, and cleans up - /// no-longer-needed data structures. - void finalizeMetrics(); + /// doLoop - Propagate block frequency down through the loop. + void doLoop(BlockT *Head, BlockT *Tail) { + DEBUG(dbgs() << "doLoop(" << getBlockName(Head) << ", " + << getBlockName(Tail) << ")\n"); - /// \brief Clear all memory. - void clear(); + SmallPtrSet BlocksInLoop; - virtual std::string getBlockName(const BlockNode &Node) const; + for (rpot_iterator I = rpot_at(Head), E = rpot_at(Tail); ; ++I) { + BlockT *BB = *I; + doBlock(BB, Head, BlocksInLoop); - virtual raw_ostream &print(raw_ostream &OS) const { return OS; } - void dump() const { print(dbgs()); } - - Float getFloatingBlockFreq(const BlockNode &Node) const; - - BlockFrequency getBlockFreq(const BlockNode &Node) const; + BlocksInLoop.insert(BB); + if (I == E) + break; + } - raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const; - raw_ostream &printBlockFreq(raw_ostream &OS, - const BlockFrequency &Freq) const; + // Compute loop's cyclic probability using backedges probabilities. + BlockFrequency BackFreq; + for (typename GT::ChildIteratorType + PI = GraphTraits< Inverse >::child_begin(Head), + PE = GraphTraits< Inverse >::child_end(Head); + PI != PE; ++PI) { + BlockT *Pred = *PI; + assert(Pred); + if (isBackedge(Pred, Head)) + BackFreq += getEdgeFreq(Pred, Head); + } - uint64_t getEntryFreq() const { - assert(!Freqs.empty()); - return Freqs[0].Integer; + // The cyclic probability is freq(BackEdges) / freq(Head), where freq(Head) + // only counts edges entering the loop, not the loop backedges. + // The probability of leaving the loop on each iteration is: + // + // ExitProb = 1 - CyclicProb + // + // The Expected number of loop iterations is: + // + // Iterations = 1 / ExitProb + // + uint64_t D = std::max(getBlockFreq(Head).getFrequency(), UINT64_C(1)); + uint64_t N = std::max(BackFreq.getFrequency(), UINT64_C(1)); + if (N < D) + N = D - N; + else + // We'd expect N < D, but rounding and saturation means that can't be + // guaranteed. + N = 1; + + // Now ExitProb = N / D, make sure it fits in an i32/i32 fraction. + assert(N <= D); + if (D > UINT32_MAX) { + unsigned Shift = 32 - countLeadingZeros(D); + D >>= Shift; + N >>= Shift; + if (N == 0) + N = 1; + } + BranchProbability LEP = BranchProbability(N, D); + LoopExitProb.insert(std::make_pair(Head, LEP)); + DEBUG(dbgs() << "LoopExitProb[" << getBlockName(Head) << "] = " << LEP + << " from 1 - "; + printBlockFreq(dbgs(), BackFreq) << " / "; + printBlockFreq(dbgs(), getBlockFreq(Head)) << ".\n"); } - /// \brief Virtual destructor. - /// - /// Need a virtual destructor to mask the compiler warning about - /// getBlockName(). - virtual ~BlockFrequencyInfoImplBase() {} -}; -namespace bfi_detail { -template struct TypeMap {}; -template <> struct TypeMap { - typedef BasicBlock BlockT; - typedef Function FunctionT; - typedef BranchProbabilityInfo BranchProbabilityInfoT; - typedef Loop LoopT; - typedef LoopInfo LoopInfoT; -}; -template <> struct TypeMap { - typedef MachineBasicBlock BlockT; - typedef MachineFunction FunctionT; - typedef MachineBranchProbabilityInfo BranchProbabilityInfoT; - typedef MachineLoop LoopT; - typedef MachineLoopInfo LoopInfoT; -}; - -/// \brief Get the name of a MachineBasicBlock. -/// -/// Get the name of a MachineBasicBlock. It's templated so that including from -/// CodeGen is unnecessary (that would be a layering issue). -/// -/// This is used mainly for debug output. The name is similar to -/// MachineBasicBlock::getFullName(), but skips the name of the function. -template std::string getBlockName(const BlockT *BB) { - assert(BB && "Unexpected nullptr"); - if (BB->getBasicBlock()) - return BB->getName().str(); - return (Twine("BB") + Twine(BB->getNumber())).str(); -} -/// \brief Get the name of a BasicBlock. -template <> inline std::string getBlockName(const BasicBlock *BB) { - assert(BB && "Unexpected nullptr"); - return BB->getName().str(); -} -} - -/// \brief Shared implementation for block frequency analysis. -/// -/// This is a shared implementation of BlockFrequencyInfo and -/// MachineBlockFrequencyInfo, and calculates the relative frequencies of -/// blocks. -/// -/// This algorithm leverages BlockMass and PositiveFloat to maintain precision, -/// separates mass distribution from loop scaling, and dithers to eliminate -/// probability mass loss. -/// -/// The implementation is split between BlockFrequencyInfoImpl, which knows the -/// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and -/// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a -/// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in -/// reverse-post order. This gives two advantages: it's easy to compare the -/// relative ordering of two nodes, and maps keyed on BlockT can be represented -/// by vectors. -/// -/// This algorithm is O(V+E), unless there is irreducible control flow, in -/// which case it's O(V*E) in the worst case. -/// -/// These are the main stages: -/// -/// 0. Reverse post-order traversal (\a initializeRPOT()). -/// -/// Run a single post-order traversal and save it (in reverse) in RPOT. -/// All other stages make use of this ordering. Save a lookup from BlockT -/// to BlockNode (the index into RPOT) in Nodes. -/// -/// 1. Loop indexing (\a initializeLoops()). -/// -/// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of -/// the algorithm. In particular, store the immediate members of each loop -/// in reverse post-order. -/// -/// 2. Calculate mass and scale in loops (\a computeMassInLoops()). -/// -/// For each loop (bottom-up), distribute mass through the DAG resulting -/// from ignoring backedges and treating sub-loops as a single pseudo-node. -/// Track the backedge mass distributed to the loop header, and use it to -/// calculate the loop scale (number of loop iterations). -/// -/// Visiting loops bottom-up is a post-order traversal of loop headers. -/// For each loop, immediate members that represent sub-loops will already -/// have been visited and packaged into a pseudo-node. -/// -/// Distributing mass in a loop is a reverse-post-order traversal through -/// the loop. Start by assigning full mass to the Loop header. For each -/// node in the loop: -/// -/// - Fetch and categorize the weight distribution for its successors. -/// If this is a packaged-subloop, the weight distribution is stored -/// in \a PackagedLoopData::Exits. Otherwise, fetch it from -/// BranchProbabilityInfo. -/// -/// - Each successor is categorized as \a Weight::Local, a normal -/// forward edge within the current loop, \a Weight::Backedge, a -/// backedge to the loop header, or \a Weight::Exit, any successor -/// outside the loop. The weight, the successor, and its category -/// are stored in \a Distribution. There can be multiple edges to -/// each successor. -/// -/// - Normalize the distribution: scale weights down so that their sum -/// is 32-bits, and coalesce multiple edges to the same node. -/// -/// - Distribute the mass accordingly, dithering to minimize mass loss, -/// as described in \a distributeMass(). Mass is distributed in -/// parallel in two ways: forward, and general. Local successors -/// take their mass from the forward mass, while exit and backedge -/// successors take their mass from the general mass. Additionally, -/// exit edges use up (ignored) mass from the forward mass, and local -/// edges use up (ignored) mass from the general distribution. -/// -/// Finally, calculate the loop scale from the accumulated backedge mass. -/// -/// 3. Distribute mass in the function (\a computeMassInFunction()). -/// -/// Finally, distribute mass through the DAG resulting from packaging all -/// loops in the function. This uses the same algorithm as distributing -/// mass in a loop, except that there are no exit or backedge edges. -/// -/// 4. Loop unpackaging and cleanup (\a finalizeMetrics()). -/// -/// Initialize the frequency to a floating point representation of its -/// mass. -/// -/// Visit loops top-down (reverse post-order), scaling the loop header's -/// frequency by its psuedo-node's mass and loop scale. Keep track of the -/// minimum and maximum final frequencies. -/// -/// Using the min and max frequencies as a guide, translate floating point -/// frequencies to an appropriate range in uint64_t. -/// -/// It has some known flaws. -/// -/// - Irreducible control flow isn't modelled correctly. In particular, -/// LoopInfo and MachineLoopInfo ignore irreducible backedges. The main -/// result is that irreducible SCCs will under-scaled. No mass is lost, -/// but the computed branch weights for the loop pseudo-node will be -/// incorrect. -/// -/// Modelling irreducible control flow exactly involves setting up and -/// solving a group of infinite geometric series. Such precision is -/// unlikely to be worthwhile, since most of our algorithms give up on -/// irreducible control flow anyway. -/// -/// Nevertheless, we might find that we need to get closer. If -/// LoopInfo/MachineLoopInfo flags loops with irreducible control flow -/// (and/or the function as a whole), we can find the SCCs, compute an -/// approximate exit frequency for the SCC as a whole, and scale up -/// accordingly. -/// -/// - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting -/// BlockFrequency's 64-bit integer precision. -template class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase { - typedef typename bfi_detail::TypeMap::BlockT BlockT; - typedef typename bfi_detail::TypeMap::FunctionT FunctionT; - typedef typename bfi_detail::TypeMap::BranchProbabilityInfoT - BranchProbabilityInfoT; - typedef typename bfi_detail::TypeMap::LoopT LoopT; - typedef typename bfi_detail::TypeMap::LoopInfoT LoopInfoT; + friend class BlockFrequencyInfo; + friend class MachineBlockFrequencyInfo; - typedef GraphTraits Successor; - typedef GraphTraits> Predecessor; + BlockFrequencyInfoImpl() { } - const BranchProbabilityInfoT *BPI; - const LoopInfoT *LI; - const FunctionT *F; + void doFunction(FunctionT *fn, BranchProbabilityInfoT *bpi) { + Fn = fn; + BPI = bpi; - // All blocks in reverse postorder. - std::vector RPOT; - DenseMap Nodes; + // Clear everything. + RPO.clear(); + POT.clear(); + LoopExitProb.clear(); + Freqs.clear(); - typedef typename std::vector::const_iterator rpot_iterator; + BlockT *EntryBlock = fn->begin(); - rpot_iterator rpot_begin() const { return RPOT.begin(); } - rpot_iterator rpot_end() const { return RPOT.end(); } + std::copy(po_begin(EntryBlock), po_end(EntryBlock), std::back_inserter(POT)); - size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); } + unsigned RPOidx = 0; + for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) { + BlockT *BB = *I; + RPO[BB] = ++RPOidx; + DEBUG(dbgs() << "RPO[" << getBlockName(BB) << "] = " << RPO[BB] << "\n"); + } - BlockNode getNode(const rpot_iterator &I) const { - return BlockNode(getIndex(I)); - } - BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB); } + // Travel over all blocks in postorder. + for (pot_iterator I = pot_begin(), E = pot_end(); I != E; ++I) { + BlockT *BB = *I; + BlockT *LastTail = nullptr; + DEBUG(dbgs() << "POT: " << getBlockName(BB) << "\n"); - const BlockT *getBlock(const BlockNode &Node) const { - return RPOT[Node.Index]; - } + for (typename GT::ChildIteratorType + PI = GraphTraits< Inverse >::child_begin(BB), + PE = GraphTraits< Inverse >::child_end(BB); + PI != PE; ++PI) { - void initializeRPOT(); - void initializeLoops(); - void runOnFunction(const FunctionT *F); + BlockT *Pred = *PI; + if (isBackedge(Pred, BB) && (!LastTail || RPO[Pred] > RPO[LastTail])) + LastTail = Pred; + } - void propagateMassToSuccessors(const BlockNode &LoopHead, - const BlockNode &Node); - void computeMassInLoops(); - void computeMassInLoop(const BlockNode &LoopHead); - void computeMassInFunction(); + if (LastTail) + doLoop(BB, LastTail); + } - std::string getBlockName(const BlockNode &Node) const override { - return bfi_detail::getBlockName(getBlock(Node)); + // At the end assume the whole function as a loop, and travel over it once + // again. + doLoop(*(rpot_begin()), *(pot_begin())); } public: - const FunctionT *getFunction() const { return F; } - void doFunction(const FunctionT *F, const BranchProbabilityInfoT *BPI, - const LoopInfoT *LI); - BlockFrequencyInfoImpl() : BPI(0), LI(0), F(0) {} + uint64_t getEntryFreq() { return EntryFreq; } - using BlockFrequencyInfoImplBase::getEntryFreq; + /// getBlockFreq - Return block frequency. Return 0 if we don't have it. BlockFrequency getBlockFreq(const BlockT *BB) const { - return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB)); - } - Float getFloatingBlockFreq(const BlockT *BB) const { - return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB)); - } - - /// \brief Print the frequencies for the current function. - /// - /// Prints the frequencies for the blocks in the current function. - /// - /// Blocks are printed in the natural iteration order of the function, rather - /// than reverse post-order. This provides two advantages: writing -analyze - /// tests is easier (since blocks come out in source order), and even - /// unreachable blocks are printed. - raw_ostream &print(raw_ostream &OS) const override; - using BlockFrequencyInfoImplBase::dump; - - using BlockFrequencyInfoImplBase::printBlockFreq; - raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const { - return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB)); - } -}; - -template -void BlockFrequencyInfoImpl::doFunction(const FunctionT *F, - const BranchProbabilityInfoT *BPI, - const LoopInfoT *LI) { - // Save the parameters. - this->BPI = BPI; - this->LI = LI; - this->F = F; - - // Clean up left-over data structures. - BlockFrequencyInfoImplBase::clear(); - RPOT.clear(); - Nodes.clear(); - - // Initialize. - DEBUG(dbgs() << "\nblock-frequency: " << F->getName() << "\n=================" - << std::string(F->getName().size(), '=') << "\n"); - initializeRPOT(); - initializeLoops(); - - // Visit loops in post-order to find thelocal mass distribution, and then do - // the full function. - computeMassInLoops(); - computeMassInFunction(); - finalizeMetrics(); -} - -template void BlockFrequencyInfoImpl::initializeRPOT() { - const BlockT *Entry = F->begin(); - RPOT.reserve(F->size()); - std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT)); - std::reverse(RPOT.begin(), RPOT.end()); - - assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() && - "More nodes in function than Block Frequency Info supports"); - - DEBUG(dbgs() << "reverse-post-order-traversal\n"); - for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) { - BlockNode Node = getNode(I); - DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) << "\n"); - Nodes[*I] = Node; + typename DenseMap::const_iterator + I = Freqs.find(BB); + if (I != Freqs.end()) + return I->second; + return 0; } - Working.resize(RPOT.size()); - Freqs.resize(RPOT.size()); -} - -template void BlockFrequencyInfoImpl::initializeLoops() { - DEBUG(dbgs() << "loop-detection\n"); - if (LI->empty()) - return; - - // Visit loops top down and assign them an index. - std::deque Q; - Q.insert(Q.end(), LI->begin(), LI->end()); - while (!Q.empty()) { - const LoopT *Loop = Q.front(); - Q.pop_front(); - Q.insert(Q.end(), Loop->begin(), Loop->end()); - - // Save the order this loop was visited. - BlockNode Header = getNode(Loop->getHeader()); - assert(Header.isValid()); - - Working[Header.Index].LoopIndex = PackagedLoops.size(); - PackagedLoops.emplace_back(Header); - DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n"); + void print(raw_ostream &OS) const { + OS << "\n\n---- Block Freqs ----\n"; + for (typename FunctionT::iterator I = Fn->begin(), E = Fn->end(); I != E;) { + BlockT *BB = I++; + OS << " " << getBlockName(BB) << " = "; + printBlockFreq(OS, getBlockFreq(BB)) << "\n"; + + for (typename GraphTraits::ChildIteratorType + SI = GraphTraits::child_begin(BB), + SE = GraphTraits::child_end(BB); SI != SE; ++SI) { + BlockT *Succ = *SI; + OS << " " << getBlockName(BB) << " -> " << getBlockName(Succ) + << " = "; printBlockFreq(OS, getEdgeFreq(BB, Succ)) << "\n"; + } + } } - // Visit nodes in reverse post-order and add them to their deepest containing - // loop. - for (size_t Index = 0; Index < RPOT.size(); ++Index) { - const LoopT *Loop = LI->getLoopFor(RPOT[Index]); - if (!Loop) - continue; - - // If this is a loop header, find its parent loop (if any). - if (Working[Index].isLoopHeader()) - if (!(Loop = Loop->getParentLoop())) - continue; - - // Add this node to its containing loop's member list. - BlockNode Header = getNode(Loop->getHeader()); - assert(Header.isValid()); - const auto &HeaderData = Working[Header.Index]; - assert(HeaderData.isLoopHeader()); - - Working[Index].ContainingLoop = Header; - PackagedLoops[HeaderData.LoopIndex].Members.push_back(Index); - DEBUG(dbgs() << " - loop = " << getBlockName(Header) - << ": member = " << getBlockName(Index) << "\n"); + void dump() const { + print(dbgs()); } -} - -template void BlockFrequencyInfoImpl::computeMassInLoops() { - // Visit loops with the deepest first, and the top-level loops last. - for (auto L = PackagedLoops.rbegin(), LE = PackagedLoops.rend(); L != LE; ++L) - computeMassInLoop(L->Header); -} - -template -void BlockFrequencyInfoImpl::computeMassInLoop(const BlockNode &LoopHead) { - // Compute mass in loop. - DEBUG(dbgs() << "compute-mass-in-loop: " << getBlockName(LoopHead) << "\n"); - - Working[LoopHead.Index].Mass = BlockMass::getFull(); - propagateMassToSuccessors(LoopHead, LoopHead); - - for (const BlockNode &M : getLoopPackage(LoopHead).Members) - propagateMassToSuccessors(LoopHead, M); - - computeLoopScale(LoopHead); - packageLoop(LoopHead); -} -template void BlockFrequencyInfoImpl::computeMassInFunction() { - // Compute mass in function. - DEBUG(dbgs() << "compute-mass-in-function\n"); - Working[0].Mass = BlockMass::getFull(); - for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) { - // Check for nodes that have been packaged. - BlockNode Node = getNode(I); - if (Working[Node.Index].hasLoopHeader()) - continue; - - propagateMassToSuccessors(BlockNode(), Node); + // Utility method that looks up the block frequency associated with BB and + // prints it to OS. + raw_ostream &printBlockFreq(raw_ostream &OS, + const BlockT *BB) { + return printBlockFreq(OS, getBlockFreq(BB)); } -} -template -void -BlockFrequencyInfoImpl::propagateMassToSuccessors(const BlockNode &LoopHead, - const BlockNode &Node) { - DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n"); - // Calculate probability for successors. - Distribution Dist; - if (Node != LoopHead && Working[Node.Index].isLoopHeader()) - addLoopSuccessorsToDist(LoopHead, Node, Dist); - else { - const BlockT *BB = getBlock(Node); - for (auto SI = Successor::child_begin(BB), SE = Successor::child_end(BB); - SI != SE; ++SI) - // Do not dereference SI, or getEdgeWeight() is linear in the number of - // successors. - addToDist(Dist, LoopHead, Node, getNode(*SI), BPI->getEdgeWeight(BB, SI)); + raw_ostream &printBlockFreq(raw_ostream &OS, + const BlockFrequency &Freq) const { + // Convert fixed-point number to decimal. + uint64_t Frequency = Freq.getFrequency(); + OS << Frequency / EntryFreq << "."; + uint64_t Rem = Frequency % EntryFreq; + uint64_t Eps = 1; + do { + Rem *= 10; + Eps *= 10; + OS << Rem / EntryFreq; + Rem = Rem % EntryFreq; + } while (Rem >= Eps/2); + return OS; } - // Distribute mass to successors, saving exit and backedge data in the - // loop header. - distributeMass(Node, LoopHead, Dist); -} +}; -template -raw_ostream &BlockFrequencyInfoImpl::print(raw_ostream &OS) const { - if (!F) - return OS; - OS << "block-frequency-info: " << F->getName() << "\n"; - for (const BlockT &BB : *F) - OS << " - " << bfi_detail::getBlockName(&BB) - << ": float = " << getFloatingBlockFreq(&BB) - << ", int = " << getBlockFreq(&BB).getFrequency() << "\n"; - - // Add an extra newline for readability. - OS << "\n"; - return OS; -} } #endif diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp index 13ab29a94d..39aef9e140 100644 --- a/lib/Analysis/BlockFrequencyInfo.cpp +++ b/lib/Analysis/BlockFrequencyInfo.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "block-freq" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/Analysis/BranchProbabilityInfo.h" @@ -107,7 +106,6 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis", true, true) @@ -122,16 +120,14 @@ BlockFrequencyInfo::~BlockFrequencyInfo() {} void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - AU.addRequired(); AU.setPreservesAll(); } bool BlockFrequencyInfo::runOnFunction(Function &F) { BranchProbabilityInfo &BPI = getAnalysis(); - LoopInfo &LI = getAnalysis(); if (!BFI) BFI.reset(new ImplType); - BFI->doFunction(&F, &BPI, &LI); + BFI->doFunction(&F, &BPI); #ifndef NDEBUG if (ViewBlockFreqPropagationDAG != GVDT_None) view(); @@ -162,7 +158,7 @@ void BlockFrequencyInfo::view() const { } const Function *BlockFrequencyInfo::getFunction() const { - return BFI ? BFI->getFunction() : nullptr; + return BFI ? BFI->Fn : nullptr; } raw_ostream &BlockFrequencyInfo:: diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp deleted file mode 100644 index f267a9cdc5..0000000000 --- a/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ /dev/null @@ -1,931 +0,0 @@ -//===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Loops should be simplified before this analysis. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "block-freq" -#include "llvm/Analysis/BlockFrequencyInfoImpl.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/Support/raw_ostream.h" -#include - -using namespace llvm; - -//===----------------------------------------------------------------------===// -// -// PositiveFloat implementation. -// -//===----------------------------------------------------------------------===// -const int PositiveFloatBase::MaxExponent; -const int PositiveFloatBase::MinExponent; - -static void appendDigit(std::string &Str, unsigned D) { - assert(D < 10); - Str += '0' + D % 10; -} - -static void appendNumber(std::string &Str, uint64_t N) { - while (N) { - appendDigit(Str, N % 10); - N /= 10; - } -} - -static bool doesRoundUp(char Digit) { - switch (Digit) { - case '5': - case '6': - case '7': - case '8': - case '9': - return true; - default: - return false; - } -} - -static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) { - assert(E >= PositiveFloatBase::MinExponent); - assert(E <= PositiveFloatBase::MaxExponent); - - // Find a new E, but don't let it increase past MaxExponent. - int LeadingZeros = PositiveFloatBase::countLeadingZeros64(D); - int NewE = std::min(PositiveFloatBase::MaxExponent, E + 63 - LeadingZeros); - int Shift = 63 - (NewE - E); - assert(Shift <= LeadingZeros); - assert(Shift == LeadingZeros || NewE == PositiveFloatBase::MaxExponent); - D <<= Shift; - E = NewE; - - // Check for a denormal. - unsigned AdjustedE = E + 16383; - if (!(D >> 63)) { - assert(E == PositiveFloatBase::MaxExponent); - AdjustedE = 0; - } - - // Build the float and print it. - uint64_t RawBits[2] = {D, AdjustedE}; - APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits)); - SmallVector Chars; - Float.toString(Chars, Precision, 0); - return std::string(Chars.begin(), Chars.end()); -} - -static std::string stripTrailingZeros(std::string Float) { - size_t NonZero = Float.find_last_not_of('0'); - assert(NonZero != std::string::npos && "no . in floating point string"); - - if (Float[NonZero] == '.') - ++NonZero; - - return Float.substr(0, NonZero + 1); -} - -std::string PositiveFloatBase::toString(uint64_t D, int16_t E, int Width, - unsigned Precision) { - if (!D) - return "0.0"; - - // Canonicalize exponent and digits. - uint64_t Above0 = 0; - uint64_t Below0 = 0; - uint64_t Extra = 0; - int ExtraShift = 0; - if (E == 0) { - Above0 = D; - } else if (E > 0) { - if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) { - D <<= Shift; - E -= Shift; - - if (!E) - Above0 = D; - } - } else if (E > -64) { - Above0 = D >> -E; - Below0 = D << (64 + E); - } else if (E > -120) { - Below0 = D >> (-E - 64); - Extra = D << (128 + E); - ExtraShift = -64 - E; - } - - // Fall back on APFloat for very small and very large numbers. - if (!Above0 && !Below0) - return toStringAPFloat(D, E, Precision); - - // Append the digits before the decimal. - std::string Str; - size_t DigitsOut = 0; - if (Above0) { - appendNumber(Str, Above0); - DigitsOut = Str.size(); - } else - appendDigit(Str, 0); - std::reverse(Str.begin(), Str.end()); - - // Return early if there's nothing after the decimal. - if (!Below0) - return Str + ".0"; - - // Append the decimal and beyond. - Str += '.'; - uint64_t Error = UINT64_C(1) << (64 - Width); - - // We need to shift Below0 to the right to make space for calculating - // digits. Save the precision we're losing in Extra. - Extra = (Below0 & 0xf) << 56 | (Extra >> 8); - Below0 >>= 4; - size_t SinceDot = 0; - size_t AfterDot = Str.size(); - do { - if (ExtraShift) { - --ExtraShift; - Error *= 5; - } else - Error *= 10; - - Below0 *= 10; - Extra *= 10; - Below0 += (Extra >> 60); - Extra = Extra & (UINT64_MAX >> 4); - appendDigit(Str, Below0 >> 60); - Below0 = Below0 & (UINT64_MAX >> 4); - if (DigitsOut || Str.back() != '0') - ++DigitsOut; - ++SinceDot; - } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 && - (!Precision || DigitsOut <= Precision || SinceDot < 2)); - - // Return early for maximum precision. - if (!Precision || DigitsOut <= Precision) - return stripTrailingZeros(Str); - - // Find where to truncate. - size_t Truncate = - std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1); - - // Check if there's anything to truncate. - if (Truncate >= Str.size()) - return stripTrailingZeros(Str); - - bool Carry = doesRoundUp(Str[Truncate]); - if (!Carry) - return stripTrailingZeros(Str.substr(0, Truncate)); - - // Round with the first truncated digit. - for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend(); - I != E; ++I) { - if (*I == '.') - continue; - if (*I == '9') { - *I = '0'; - continue; - } - - ++*I; - Carry = false; - break; - } - - // Add "1" in front if we still need to carry. - return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate)); -} - -raw_ostream &PositiveFloatBase::print(raw_ostream &OS, uint64_t D, int16_t E, - int Width, unsigned Precision) { - return OS << toString(D, E, Width, Precision); -} - -void PositiveFloatBase::dump(uint64_t D, int16_t E, int Width) { - print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E - << "]"; -} - -static std::pair -getRoundedFloat(uint64_t N, bool ShouldRound, int64_t Shift) { - if (ShouldRound) - if (!++N) - // Rounding caused an overflow. - return std::make_pair(UINT64_C(1), Shift + 64); - return std::make_pair(N, Shift); -} - -std::pair PositiveFloatBase::divide64(uint64_t Dividend, - uint64_t Divisor) { - // Input should be sanitized. - assert(Divisor); - assert(Dividend); - - // Minimize size of divisor. - int16_t Shift = 0; - if (int Zeros = countTrailingZeros(Divisor)) { - Shift -= Zeros; - Divisor >>= Zeros; - } - - // Check for powers of two. - if (Divisor == 1) - return std::make_pair(Dividend, Shift); - - // Maximize size of dividend. - if (int Zeros = countLeadingZeros64(Dividend)) { - Shift -= Zeros; - Dividend <<= Zeros; - } - - // Start with the result of a divide. - uint64_t Quotient = Dividend / Divisor; - Dividend %= Divisor; - - // Continue building the quotient with long division. - // - // TODO: continue with largers digits. - while (!(Quotient >> 63) && Dividend) { - // Shift Dividend, and check for overflow. - bool IsOverflow = Dividend >> 63; - Dividend <<= 1; - --Shift; - - // Divide. - bool DoesDivide = IsOverflow || Divisor <= Dividend; - Quotient = (Quotient << 1) | uint64_t(DoesDivide); - Dividend -= DoesDivide ? Divisor : 0; - } - - // Round. - if (Dividend >= getHalf(Divisor)) - if (!++Quotient) - // Rounding caused an overflow in Quotient. - return std::make_pair(UINT64_C(1), Shift + 64); - - return getRoundedFloat(Quotient, Dividend >= getHalf(Divisor), Shift); -} - -static void addWithCarry(uint64_t &Upper, uint64_t &Lower, uint64_t N) { - uint64_t NewLower = Lower + (N << 32); - Upper += (N >> 32) + (NewLower < Lower); - Lower = NewLower; -} - -std::pair PositiveFloatBase::multiply64(uint64_t L, - uint64_t R) { - // Separate into two 32-bit digits (U.L). - uint64_t UL = L >> 32, LL = L & UINT32_MAX, UR = R >> 32, LR = R & UINT32_MAX; - - // Compute cross products. - uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR; - - // Sum into two 64-bit digits. - uint64_t Upper = P1, Lower = P4; - addWithCarry(Upper, Lower, P2); - addWithCarry(Upper, Lower, P3); - - // Check for the lower 32 bits. - if (!Upper) - return std::make_pair(Lower, 0); - - // Shift as little as possible to maximize precision. - unsigned LeadingZeros = countLeadingZeros64(Upper); - int16_t Shift = 64 - LeadingZeros; - if (LeadingZeros) - Upper = Upper << LeadingZeros | Lower >> Shift; - bool ShouldRound = Shift && (Lower & UINT64_C(1) << (Shift - 1)); - return getRoundedFloat(Upper, ShouldRound, Shift); -} - -//===----------------------------------------------------------------------===// -// -// BlockMass implementation. -// -//===----------------------------------------------------------------------===// -BlockMass &BlockMass::operator*=(const BranchProbability &P) { - uint32_t N = P.getNumerator(), D = P.getDenominator(); - assert(D || "divide by 0"); - assert(N <= D || "fraction greater than 1"); - - // Fast path for multiplying by 1.0. - if (!Mass || N == D) - return *this; - - // Get as much precision as we can. - int Shift = countLeadingZeros(Mass); - uint64_t ShiftedQuotient = (Mass << Shift) / D; - uint64_t Product = ShiftedQuotient * N >> Shift; - - // Now check for what's lost. - uint64_t Left = ShiftedQuotient * (D - N) >> Shift; - uint64_t Lost = Mass - Product - Left; - - // TODO: prove this assertion. - assert(Lost <= UINT32_MAX); - - // Take the product plus a portion of the spoils. - Mass = Product + Lost * N / D; - return *this; -} - -PositiveFloat BlockMass::toFloat() const { - if (isFull()) - return PositiveFloat(1, 0); - return PositiveFloat(getMass() + 1, -64); -} - -void BlockMass::dump() const { print(dbgs()); } - -static char getHexDigit(int N) { - assert(N < 16); - if (N < 10) - return '0' + N; - return 'a' + N - 10; -} -raw_ostream &BlockMass::print(raw_ostream &OS) const { - for (int Digits = 0; Digits < 16; ++Digits) - OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf); - return OS; -} - -//===----------------------------------------------------------------------===// -// -// BlockFrequencyInfoImpl implementation. -// -//===----------------------------------------------------------------------===// -namespace { - -typedef BlockFrequencyInfoImplBase::BlockNode BlockNode; -typedef BlockFrequencyInfoImplBase::Distribution Distribution; -typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList; -typedef BlockFrequencyInfoImplBase::Float Float; -typedef BlockFrequencyInfoImplBase::PackagedLoopData PackagedLoopData; -typedef BlockFrequencyInfoImplBase::Weight Weight; -typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData; - -/// \brief Dithering mass distributer. -/// -/// This class splits up a single mass into portions by weight, dithering to -/// spread out error. No mass is lost. The dithering precision depends on the -/// precision of the product of \a BlockMass and \a BranchProbability. -/// -/// The distribution algorithm follows. -/// -/// 1. Initialize by saving the sum of the weights in \a RemWeight and the -/// mass to distribute in \a RemMass. -/// -/// 2. For each portion: -/// -/// 1. Construct a branch probability, P, as the portion's weight divided -/// by the current value of \a RemWeight. -/// 2. Calculate the portion's mass as \a RemMass times P. -/// 3. Update \a RemWeight and \a RemMass at each portion by subtracting -/// the current portion's weight and mass. -/// -/// Mass is distributed in two ways: full distribution and forward -/// distribution. The latter ignores backedges, and uses the parallel fields -/// \a RemForwardWeight and \a RemForwardMass. -struct DitheringDistributer { - uint32_t RemWeight; - uint32_t RemForwardWeight; - - BlockMass RemMass; - BlockMass RemForwardMass; - - DitheringDistributer(Distribution &Dist, const BlockMass &Mass); - - BlockMass takeLocalMass(uint32_t Weight) { - (void)takeMass(Weight); - return takeForwardMass(Weight); - } - BlockMass takeExitMass(uint32_t Weight) { - (void)takeForwardMass(Weight); - return takeMass(Weight); - } - BlockMass takeBackedgeMass(uint32_t Weight) { return takeMass(Weight); } - -private: - BlockMass takeForwardMass(uint32_t Weight); - BlockMass takeMass(uint32_t Weight); -}; -} - -DitheringDistributer::DitheringDistributer(Distribution &Dist, - const BlockMass &Mass) { - Dist.normalize(); - RemWeight = Dist.Total; - RemForwardWeight = Dist.ForwardTotal; - RemMass = Mass; - RemForwardMass = Dist.ForwardTotal ? Mass : BlockMass(); -} - -BlockMass DitheringDistributer::takeForwardMass(uint32_t Weight) { - // Compute the amount of mass to take. - assert(Weight && "invalid weight"); - assert(Weight <= RemForwardWeight); - BlockMass Mass = RemForwardMass * BranchProbability(Weight, RemForwardWeight); - - // Decrement totals (dither). - RemForwardWeight -= Weight; - RemForwardMass -= Mass; - return Mass; -} -BlockMass DitheringDistributer::takeMass(uint32_t Weight) { - assert(Weight && "invalid weight"); - assert(Weight <= RemWeight); - BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight); - - // Decrement totals (dither). - RemWeight -= Weight; - RemMass -= Mass; - return Mass; -} - -void Distribution::add(const BlockNode &Node, uint64_t Amount, - Weight::DistType Type) { - assert(Amount && "invalid weight of 0"); - uint64_t NewTotal = Total + Amount; - - // Check for overflow. It should be impossible to overflow twice. - bool IsOverflow = NewTotal < Total; - assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow"); - DidOverflow |= IsOverflow; - - // Update the total. - Total = NewTotal; - - // Save the weight. - Weight W; - W.TargetNode = Node; - W.Amount = Amount; - W.Type = Type; - Weights.push_back(W); - - if (Type == Weight::Backedge) - return; - - // Update forward total. Don't worry about overflow here, since then Total - // will exceed 32-bits and they'll both be recomputed in normalize(). - ForwardTotal += Amount; -} - -static void combineWeight(Weight &W, const Weight &OtherW) { - assert(OtherW.TargetNode.isValid()); - if (!W.Amount) { - W = OtherW; - return; - } - assert(W.Type == OtherW.Type); - assert(W.TargetNode == OtherW.TargetNode); - assert(W.Amount < W.Amount + OtherW.Amount); - W.Amount += OtherW.Amount; -} -static void combineWeightsBySorting(WeightList &Weights) { - // Sort so edges to the same node are adjacent. - std::sort(Weights.begin(), Weights.end(), - [](const Weight &L, - const Weight &R) { return L.TargetNode < R.TargetNode; }); - - // Combine adjacent edges. - WeightList::iterator O = Weights.begin(); - for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E; - ++O, (I = L)) { - *O = *I; - - // Find the adjacent weights to the same node. - for (++L; L != E && I->TargetNode == L->TargetNode; ++L) - combineWeight(*O, *L); - } - - // Erase extra entries. - Weights.erase(O, Weights.end()); - return; -} -static void combineWeightsByHashing(WeightList &Weights) { - // Collect weights into a DenseMap. - typedef DenseMap HashTable; - HashTable Combined(NextPowerOf2(2 * Weights.size())); - for (const Weight &W : Weights) - combineWeight(Combined[W.TargetNode.Index], W); - - // Check whether anything changed. - if (Weights.size() == Combined.size()) - return; - - // Fill in the new weights. - Weights.clear(); - Weights.reserve(Combined.size()); - for (const auto &I : Combined) - Weights.push_back(I.second); -} -static void combineWeights(WeightList &Weights) { - // Use a hash table for many successors to keep this linear. - if (Weights.size() > 128) { - combineWeightsByHashing(Weights); - return; - } - - combineWeightsBySorting(Weights); -} -static uint64_t shiftRightAndRound(uint64_t N, int Shift) { - assert(Shift >= 0); - assert(Shift < 64); - if (!Shift) - return N; - return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1)); -} -void Distribution::normalize() { - // Early exit for termination nodes. - if (Weights.empty()) - return; - - // Only bother if there are multiple successors. - if (Weights.size() > 1) - combineWeights(Weights); - - // Early exit when combined into a single successor. - if (Weights.size() == 1) { - Total = 1; - ForwardTotal = Weights.front().Type != Weight::Backedge; - Weights.front().Amount = 1; - return; - } - - // Determine how much to shift right so that the total fits into 32-bits. - // - // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1 - // for each weight can cause a 32-bit overflow. - int Shift = 0; - if (DidOverflow) - Shift = 33; - else if (Total > UINT32_MAX) - Shift = 33 - countLeadingZeros(Total); - - // Early exit if nothing needs to be scaled. - if (!Shift) - return; - - // Recompute the total through accumulation (rather than shifting it) so that - // it's accurate after shifting. ForwardTotal is dirty here anyway. - Total = 0; - ForwardTotal = 0; - - // Sum the weights to each node and shift right if necessary. - for (Weight &W : Weights) { - // Scale down below UINT32_MAX. Since Shift is larger than necessary, we - // can round here without concern about overflow. - assert(W.TargetNode.isValid()); - W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift)); - assert(W.Amount <= UINT32_MAX); - - // Update the total. - Total += W.Amount; - if (W.Type == Weight::Backedge) - continue; - - // Update the forward total. - ForwardTotal += W.Amount; - } - assert(Total <= UINT32_MAX); -} - -void BlockFrequencyInfoImplBase::clear() { - *this = BlockFrequencyInfoImplBase(); -} - -/// \brief Clear all memory not needed downstream. -/// -/// Releases all memory not used downstream. In particular, saves Freqs. -static void cleanup(BlockFrequencyInfoImplBase &BFI) { - std::vector SavedFreqs(std::move(BFI.Freqs)); - BFI.clear(); - BFI.Freqs = std::move(SavedFreqs); -} - -/// \brief Get a possibly packaged node. -/// -/// Get the node currently representing Node, which could be a containing -/// loop. -/// -/// This function should only be called when distributing mass. As long as -/// there are no irreducilbe edges to Node, then it will have complexity O(1) -/// in this context. -/// -/// In general, the complexity is O(L), where L is the number of loop headers -/// Node has been packaged into. Since this method is called in the context -/// of distributing mass, L will be the number of loop headers an early exit -/// edge jumps out of. -static BlockNode getPackagedNode(const BlockFrequencyInfoImplBase &BFI, - const BlockNode &Node) { - assert(Node.isValid()); - if (!BFI.Working[Node.Index].IsPackaged) - return Node; - if (!BFI.Working[Node.Index].ContainingLoop.isValid()) - return Node; - return getPackagedNode(BFI, BFI.Working[Node.Index].ContainingLoop); -} - -/// \brief Get the appropriate mass for a possible pseudo-node loop package. -/// -/// Get appropriate mass for Node. If Node is a loop-header (whose loop has -/// been packaged), returns the mass of its pseudo-node. If it's a node inside -/// a packaged loop, it returns the loop's pseudo-node. -static BlockMass &getPackageMass(BlockFrequencyInfoImplBase &BFI, - const BlockNode &Node) { - assert(Node.isValid()); - assert(!BFI.Working[Node.Index].IsPackaged); - if (!BFI.Working[Node.Index].IsAPackage) - return BFI.Working[Node.Index].Mass; - - return BFI.getLoopPackage(Node).Mass; -} - -void BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, - const BlockNode &LoopHead, - const BlockNode &Pred, - const BlockNode &Succ, - uint64_t Weight) { - if (!Weight) - Weight = 1; - -#ifndef NDEBUG - auto debugSuccessor = [&](const char *Type, const BlockNode &Resolved) { - dbgs() << " =>" - << " [" << Type << "] weight = " << Weight; - if (Succ != LoopHead) - dbgs() << ", succ = " << getBlockName(Succ); - if (Resolved != Succ) - dbgs() << ", resolved = " << getBlockName(Resolved); - dbgs() << "\n"; - }; - (void)debugSuccessor; -#endif - - if (Succ == LoopHead) { - DEBUG(debugSuccessor("backedge", Succ)); - Dist.addBackedge(LoopHead, Weight); - return; - } - BlockNode Resolved = getPackagedNode(*this, Succ); - assert(Resolved != LoopHead); - - if (Working[Resolved.Index].ContainingLoop != LoopHead) { - DEBUG(debugSuccessor(" exit ", Resolved)); - Dist.addExit(Resolved, Weight); - return; - } - - if (!LoopHead.isValid() && Resolved < Pred) { - // Irreducible backedge. Skip this edge in the distribution. - DEBUG(debugSuccessor("skipped ", Resolved)); - return; - } - - DEBUG(debugSuccessor(" local ", Resolved)); - Dist.addLocal(Resolved, Weight); -} - -void BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( - const BlockNode &LoopHead, const BlockNode &LocalLoopHead, - Distribution &Dist) { - PackagedLoopData &LoopPackage = getLoopPackage(LocalLoopHead); - const PackagedLoopData::ExitMap &Exits = LoopPackage.Exits; - - // Copy the exit map into Dist. - for (const auto &I : Exits) - addToDist(Dist, LoopHead, LocalLoopHead, I.first, I.second.getMass()); - - // We don't need this map any more. Clear it to prevent quadratic memory - // usage in deeply nested loops with irreducible control flow. - LoopPackage.Exits.clear(); -} - -/// \brief Get the maximum allowed loop scale. -/// -/// Gives the maximum number of estimated iterations allowed for a loop. -/// Downstream users have trouble with very large numbers (even within -/// 64-bits). Perhaps they can be changed to use PositiveFloat. -/// -/// TODO: change downstream users so that this can be increased or removed. -static Float getMaxLoopScale() { return Float(1, 12); } - -/// \brief Compute the loop scale for a loop. -void BlockFrequencyInfoImplBase::computeLoopScale(const BlockNode &LoopHead) { - // Compute loop scale. - DEBUG(dbgs() << "compute-loop-scale: " << getBlockName(LoopHead) << "\n"); - - // LoopScale == 1 / ExitMass - // ExitMass == HeadMass - BackedgeMass - PackagedLoopData &LoopPackage = getLoopPackage(LoopHead); - BlockMass ExitMass = BlockMass::getFull() - LoopPackage.BackedgeMass; - - // Block scale stores the inverse of the scale. - LoopPackage.Scale = ExitMass.toFloat().inverse(); - - DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull() - << " - " << LoopPackage.BackedgeMass << ")\n" - << " - scale = " << LoopPackage.Scale << "\n"); - - if (LoopPackage.Scale > getMaxLoopScale()) { - LoopPackage.Scale = getMaxLoopScale(); - DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n"); - } -} - -/// \brief Package up a loop. -void BlockFrequencyInfoImplBase::packageLoop(const BlockNode &LoopHead) { - DEBUG(dbgs() << "packaging-loop: " << getBlockName(LoopHead) << "\n"); - Working[LoopHead.Index].IsAPackage = true; - for (const BlockNode &M : getLoopPackage(LoopHead).Members) { - DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); - Working[M.Index].IsPackaged = true; - } -} - -void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, - const BlockNode &LoopHead, - Distribution &Dist) { - BlockMass Mass = getPackageMass(*this, Source); - DEBUG(dbgs() << " => mass: " << Mass - << " ( general | forward )\n"); - - // Distribute mass to successors as laid out in Dist. - DitheringDistributer D(Dist, Mass); - -#ifndef NDEBUG - auto debugAssign = [&](const BlockNode &T, const BlockMass &M, - const char *Desc) { - dbgs() << " => assign " << M << " (" << D.RemMass << "|" - << D.RemForwardMass << ")"; - if (Desc) - dbgs() << " [" << Desc << "]"; - if (T.isValid()) - dbgs() << " to " << getBlockName(T); - dbgs() << "\n"; - }; - (void)debugAssign; -#endif - - PackagedLoopData *LoopPackage = 0; - if (LoopHead.isValid()) - LoopPackage = &getLoopPackage(LoopHead); - for (const Weight &W : Dist.Weights) { - // Check for a local edge (forward and non-exit). - if (W.Type == Weight::Local) { - BlockMass Local = D.takeLocalMass(W.Amount); - getPackageMass(*this, W.TargetNode) += Local; - DEBUG(debugAssign(W.TargetNode, Local, nullptr)); - continue; - } - - // Backedges and exits only make sense if we're processing a loop. - assert(LoopPackage && "backedge or exit outside of loop"); - - // Check for a backedge. - if (W.Type == Weight::Backedge) { - BlockMass Back = D.takeBackedgeMass(W.Amount); - LoopPackage->BackedgeMass += Back; - DEBUG(debugAssign(BlockNode(), Back, "back")); - continue; - } - - // This must be an exit. - assert(W.Type == Weight::Exit); - BlockMass Exit = D.takeExitMass(W.Amount); - LoopPackage->Exits.push_back(std::make_pair(W.TargetNode, Exit)); - DEBUG(debugAssign(W.TargetNode, Exit, "exit")); - } -} - -static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, - const Float &Min, const Float &Max) { - // Scale the Factor to a size that creates integers. Ideally, integers would - // be scaled so that Max == UINT64_MAX so that they can be best - // differentiated. However, the register allocator currently deals poorly - // with large numbers. Instead, push Min up a little from 1 to give some - // room to differentiate small, unequal numbers. - // - // TODO: fix issues downstream so that ScalingFactor can be Float(1,64)/Max. - Float ScalingFactor = Min.inverse(); - if ((Max / Min).lg() < 60) - ScalingFactor <<= 3; - - // Translate the floats to integers. - DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max - << ", factor = " << ScalingFactor << "\n"); - for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { - Float Scaled = BFI.Freqs[Index].Floating * ScalingFactor; - BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt()); - DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " - << BFI.Freqs[Index].Floating << ", scaled = " << Scaled - << ", int = " << BFI.Freqs[Index].Integer << "\n"); - } -} - -static void scaleBlockData(BlockFrequencyInfoImplBase &BFI, - const BlockNode &Node, - const PackagedLoopData &Loop) { - Float F = Loop.Mass.toFloat() * Loop.Scale; - - Float &Current = BFI.Freqs[Node.Index].Floating; - Float Updated = Current * F; - - DEBUG(dbgs() << " - " << BFI.getBlockName(Node) << ": " << Current << " => " - << Updated << "\n"); - - Current = Updated; -} - -/// \brief Unwrap a loop package. -/// -/// Visits all the members of a loop, adjusting their BlockData according to -/// the loop's pseudo-node. -static void unwrapLoopPackage(BlockFrequencyInfoImplBase &BFI, - const BlockNode &Head) { - assert(Head.isValid()); - - PackagedLoopData &LoopPackage = BFI.getLoopPackage(Head); - DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getBlockName(Head) - << ": mass = " << LoopPackage.Mass - << ", scale = " << LoopPackage.Scale << "\n"); - scaleBlockData(BFI, Head, LoopPackage); - - // Propagate the head scale through the loop. Since members are visited in - // RPO, the head scale will be updated by the loop scale first, and then the - // final head scale will be used for updated the rest of the members. - for (const BlockNode &M : LoopPackage.Members) { - const FrequencyData &HeadData = BFI.Freqs[Head.Index]; - FrequencyData &Freqs = BFI.Freqs[M.Index]; - Float NewFreq = Freqs.Floating * HeadData.Floating; - DEBUG(dbgs() << " - " << BFI.getBlockName(M) << ": " << Freqs.Floating - << " => " << NewFreq << "\n"); - Freqs.Floating = NewFreq; - } -} - -void BlockFrequencyInfoImplBase::finalizeMetrics() { - // Set initial frequencies from loop-local masses. - for (size_t Index = 0; Index < Working.size(); ++Index) - Freqs[Index].Floating = Working[Index].Mass.toFloat(); - - // Unwrap loop packages in reverse post-order, tracking min and max - // frequencies. - auto Min = Float::getLargest(); - auto Max = Float::getZero(); - for (size_t Index = 0; Index < Working.size(); ++Index) { - if (Working[Index].isLoopHeader()) - unwrapLoopPackage(*this, BlockNode(Index)); - - // Update max scale. - Min = std::min(Min, Freqs[Index].Floating); - Max = std::max(Max, Freqs[Index].Floating); - } - - // Convert to integers. - convertFloatingToInteger(*this, Min, Max); - - // Clean up data structures. - cleanup(*this); - - // Print out the final stats. - DEBUG(dump()); -} - -BlockFrequency -BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const { - if (!Node.isValid()) - return 0; - return Freqs[Node.Index].Integer; -} -Float -BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { - if (!Node.isValid()) - return Float::getZero(); - return Freqs[Node.Index].Floating; -} - -std::string -BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { - return std::string(); -} - -raw_ostream & -BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, - const BlockNode &Node) const { - return OS << getFloatingBlockFreq(Node); -} - -raw_ostream & -BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, - const BlockFrequency &Freq) const { - Float Block(Freq.getFrequency(), 0); - Float Entry(getEntryFreq(), 0); - - return OS << Block / Entry; -} diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index 0b0b2f92ea..c6d4573885 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -7,7 +7,6 @@ add_llvm_library(LLVMAnalysis Analysis.cpp BasicAliasAnalysis.cpp BlockFrequencyInfo.cpp - BlockFrequencyInfoImpl.cpp BranchProbabilityInfo.cpp CFG.cpp CFGPrinter.cpp diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp index d3ac0c0437..70efa307d5 100644 --- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -11,12 +11,9 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "block-freq" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" @@ -115,7 +112,6 @@ struct DOTGraphTraits : INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq", "Machine Block Frequency Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq", "Machine Block Frequency Analysis", true, true) @@ -131,18 +127,16 @@ MachineBlockFrequencyInfo::~MachineBlockFrequencyInfo() {} void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) { MachineBranchProbabilityInfo &MBPI = - getAnalysis(); - MachineLoopInfo &MLI = getAnalysis(); + getAnalysis(); if (!MBFI) MBFI.reset(new ImplType); - MBFI->doFunction(&F, &MBPI, &MLI); + MBFI->doFunction(&F, &MBPI); #ifndef NDEBUG if (ViewMachineBlockFreqPropagationDAG != GVDT_None) { view(); @@ -172,7 +166,7 @@ getBlockFreq(const MachineBasicBlock *MBB) const { } const MachineFunction *MachineBlockFrequencyInfo::getFunction() const { - return MBFI ? MBFI->getFunction() : nullptr; + return MBFI ? MBFI->Fn : nullptr; } raw_ostream & diff --git a/test/Analysis/BlockFrequencyInfo/bad_input.ll b/test/Analysis/BlockFrequencyInfo/bad_input.ll deleted file mode 100644 index bcdc1e6f0b..0000000000 --- a/test/Analysis/BlockFrequencyInfo/bad_input.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: opt < %s -analyze -block-freq | FileCheck %s - -declare void @g(i32 %x) - -; CHECK-LABEL: Printing analysis {{.*}} for function 'branch_weight_0': -; CHECK-NEXT: block-frequency-info: branch_weight_0 -define void @branch_weight_0(i32 %a) { -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] -entry: - br label %for.body - -; Check that we get 1,4 instead of 0,3. -; CHECK-NEXT: for.body: float = 4.0, -for.body: - %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] - call void @g(i32 %i) - %inc = add i32 %i, 1 - %cmp = icmp ugt i32 %inc, %a - br i1 %cmp, label %for.end, label %for.body, !prof !0 - -; CHECK-NEXT: for.end: float = 1.0, int = [[ENTRY]] -for.end: - ret void -} - -!0 = metadata !{metadata !"branch_weights", i32 0, i32 3} - -; CHECK-LABEL: Printing analysis {{.*}} for function 'infinite_loop' -; CHECK-NEXT: block-frequency-info: infinite_loop -define void @infinite_loop(i1 %x) { -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] -entry: - br i1 %x, label %for.body, label %for.end, !prof !1 - -; Check that the loop scale maxes out at 4096, giving 2048 here. -; CHECK-NEXT: for.body: float = 2048.0, -for.body: - %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] - call void @g(i32 %i) - %inc = add i32 %i, 1 - br label %for.body - -; Check that the exit weight is half of entry, since half is lost in the -; infinite loop above. -; CHECK-NEXT: for.end: float = 0.5, -for.end: - ret void -} - -!1 = metadata !{metadata !"branch_weights", i32 1, i32 1} diff --git a/test/Analysis/BlockFrequencyInfo/basic.ll b/test/Analysis/BlockFrequencyInfo/basic.ll index 006e6ab4d7..ce29fb5ce1 100644 --- a/test/Analysis/BlockFrequencyInfo/basic.ll +++ b/test/Analysis/BlockFrequencyInfo/basic.ll @@ -1,14 +1,13 @@ ; RUN: opt < %s -analyze -block-freq | FileCheck %s define i32 @test1(i32 %i, i32* %a) { -; CHECK-LABEL: Printing analysis {{.*}} for function 'test1': -; CHECK-NEXT: block-frequency-info: test1 -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +; CHECK: Printing analysis {{.*}} for function 'test1' +; CHECK: entry = 1.0 entry: br label %body ; Loop backedges are weighted and thus their bodies have a greater frequency. -; CHECK-NEXT: body: float = 32.0, +; CHECK: body = 32.0 body: %iv = phi i32 [ 0, %entry ], [ %next, %body ] %base = phi i32 [ 0, %entry ], [ %sum, %body ] @@ -19,29 +18,29 @@ body: %exitcond = icmp eq i32 %next, %i br i1 %exitcond, label %exit, label %body -; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] +; CHECK: exit = 1.0 exit: ret i32 %sum } define i32 @test2(i32 %i, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis {{.*}} for function 'test2': -; CHECK-NEXT: block-frequency-info: test2 -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +; CHECK: Printing analysis {{.*}} for function 'test2' +; CHECK: entry = 1.0 entry: %cond = icmp ult i32 %i, 42 br i1 %cond, label %then, label %else, !prof !0 ; The 'then' branch is predicted more likely via branch weight metadata. -; CHECK-NEXT: then: float = 0.9411{{[0-9]*}}, +; CHECK: then = 0.94116 then: br label %exit -; CHECK-NEXT: else: float = 0.05882{{[0-9]*}}, +; CHECK: else = 0.05877 else: br label %exit -; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] +; FIXME: It may be a bug that we don't sum back to 1.0. +; CHECK: exit = 0.99993 exit: %result = phi i32 [ %a, %then ], [ %b, %else ] ret i32 %result @@ -50,37 +49,37 @@ exit: !0 = metadata !{metadata !"branch_weights", i32 64, i32 4} define i32 @test3(i32 %i, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { -; CHECK-LABEL: Printing analysis {{.*}} for function 'test3': -; CHECK-NEXT: block-frequency-info: test3 -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +; CHECK: Printing analysis {{.*}} for function 'test3' +; CHECK: entry = 1.0 entry: switch i32 %i, label %case_a [ i32 1, label %case_b i32 2, label %case_c i32 3, label %case_d i32 4, label %case_e ], !prof !1 -; CHECK-NEXT: case_a: float = 0.05, +; CHECK: case_a = 0.04998 case_a: br label %exit -; CHECK-NEXT: case_b: float = 0.05, +; CHECK: case_b = 0.04998 case_b: br label %exit ; The 'case_c' branch is predicted more likely via branch weight metadata. -; CHECK-NEXT: case_c: float = 0.8, +; CHECK: case_c = 0.79998 case_c: br label %exit -; CHECK-NEXT: case_d: float = 0.05, +; CHECK: case_d = 0.04998 case_d: br label %exit -; CHECK-NEXT: case_e: float = 0.05, +; CHECK: case_e = 0.04998 case_e: br label %exit -; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] +; FIXME: It may be a bug that we don't sum back to 1.0. +; CHECK: exit = 0.99993 exit: %result = phi i32 [ %a, %case_a ], [ %b, %case_b ], @@ -92,50 +91,44 @@ exit: !1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4} +; CHECK: Printing analysis {{.*}} for function 'nested_loops' +; CHECK: entry = 1.0 +; This test doesn't seem to be assigning sensible frequencies to nested loops. define void @nested_loops(i32 %a) { -; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loops': -; CHECK-NEXT: block-frequency-info: nested_loops -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: br label %for.cond1.preheader -; CHECK-NEXT: for.cond1.preheader: float = 4001.0, for.cond1.preheader: %x.024 = phi i32 [ 0, %entry ], [ %inc12, %for.inc11 ] br label %for.cond4.preheader -; CHECK-NEXT: for.cond4.preheader: float = 16008001.0, for.cond4.preheader: %y.023 = phi i32 [ 0, %for.cond1.preheader ], [ %inc9, %for.inc8 ] %add = add i32 %y.023, %x.024 br label %for.body6 -; CHECK-NEXT: for.body6: float = 64048012001.0, for.body6: %z.022 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] %add7 = add i32 %add, %z.022 - tail call void @g(i32 %add7) + tail call void @g(i32 %add7) #2 %inc = add i32 %z.022, 1 %cmp5 = icmp ugt i32 %inc, %a br i1 %cmp5, label %for.inc8, label %for.body6, !prof !2 -; CHECK-NEXT: for.inc8: float = 16008001.0, for.inc8: %inc9 = add i32 %y.023, 1 %cmp2 = icmp ugt i32 %inc9, %a br i1 %cmp2, label %for.inc11, label %for.cond4.preheader, !prof !2 -; CHECK-NEXT: for.inc11: float = 4001.0, for.inc11: %inc12 = add i32 %x.024, 1 %cmp = icmp ugt i32 %inc12, %a br i1 %cmp, label %for.end13, label %for.cond1.preheader, !prof !2 -; CHECK-NEXT: for.end13: float = 1.0, int = [[ENTRY]] for.end13: ret void } -declare void @g(i32) +declare void @g(i32) #1 !2 = metadata !{metadata !"branch_weights", i32 1, i32 4000} diff --git a/test/Analysis/BlockFrequencyInfo/double_exit.ll b/test/Analysis/BlockFrequencyInfo/double_exit.ll deleted file mode 100644 index 2fe617c9f5..0000000000 --- a/test/Analysis/BlockFrequencyInfo/double_exit.ll +++ /dev/null @@ -1,165 +0,0 @@ -; RUN: opt < %s -analyze -block-freq | FileCheck %s - -; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit': -; CHECK-NEXT: block-frequency-info: double_exit -define i32 @double_exit(i32 %N) { -; Mass = 1 -; Frequency = 1 -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] -entry: - br label %outer - -; Mass = 1 -; Backedge mass = 1/3, exit mass = 2/3 -; Loop scale = 3/2 -; Psuedo-edges = exit -; Psuedo-mass = 1 -; Frequency = 1*3/2*1 = 3/2 -; CHECK-NEXT: outer: float = 1.5, -outer: - %I.0 = phi i32 [ 0, %entry ], [ %inc6, %outer.inc ] - %Return.0 = phi i32 [ 0, %entry ], [ %Return.1, %outer.inc ] - %cmp = icmp slt i32 %I.0, %N - br i1 %cmp, label %inner, label %exit, !prof !2 ; 2:1 - -; Mass = 1 -; Backedge mass = 3/5, exit mass = 2/5 -; Loop scale = 5/2 -; Pseudo-edges = outer.inc @ 1/5, exit @ 1/5 -; Pseudo-mass = 2/3 -; Frequency = 3/2*1*5/2*2/3 = 5/2 -; CHECK-NEXT: inner: float = 2.5, -inner: - %Return.1 = phi i32 [ %Return.0, %outer ], [ %call4, %inner.inc ] - %J.0 = phi i32 [ %I.0, %outer ], [ %inc, %inner.inc ] - %cmp2 = icmp slt i32 %J.0, %N - br i1 %cmp2, label %inner.body, label %outer.inc, !prof !1 ; 4:1 - -; Mass = 4/5 -; Frequency = 5/2*4/5 = 2 -; CHECK-NEXT: inner.body: float = 2.0, -inner.body: - %call = call i32 @c2(i32 %I.0, i32 %J.0) - %tobool = icmp ne i32 %call, 0 - br i1 %tobool, label %exit, label %inner.inc, !prof !0 ; 3:1 - -; Mass = 3/5 -; Frequency = 5/2*3/5 = 3/2 -; CHECK-NEXT: inner.inc: float = 1.5, -inner.inc: - %call4 = call i32 @logic2(i32 %Return.1, i32 %I.0, i32 %J.0) - %inc = add nsw i32 %J.0, 1 - br label %inner - -; Mass = 1/3 -; Frequency = 3/2*1/3 = 1/2 -; CHECK-NEXT: outer.inc: float = 0.5, -outer.inc: - %inc6 = add nsw i32 %I.0, 1 - br label %outer - -; Mass = 1 -; Frequency = 1 -; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] -exit: - %Return.2 = phi i32 [ %Return.1, %inner.body ], [ %Return.0, %outer ] - ret i32 %Return.2 -} - -!0 = metadata !{metadata !"branch_weights", i32 1, i32 3} -!1 = metadata !{metadata !"branch_weights", i32 4, i32 1} -!2 = metadata !{metadata !"branch_weights", i32 2, i32 1} - -declare i32 @c2(i32, i32) -declare i32 @logic2(i32, i32, i32) - -; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit_in_loop': -; CHECK-NEXT: block-frequency-info: double_exit_in_loop -define i32 @double_exit_in_loop(i32 %N) { -; Mass = 1 -; Frequency = 1 -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] -entry: - br label %outer - -; Mass = 1 -; Backedge mass = 1/2, exit mass = 1/2 -; Loop scale = 2 -; Pseudo-edges = exit -; Psuedo-mass = 1 -; Frequency = 1*2*1 = 2 -; CHECK-NEXT: outer: float = 2.0, -outer: - %I.0 = phi i32 [ 0, %entry ], [ %inc12, %outer.inc ] - %Return.0 = phi i32 [ 0, %entry ], [ %Return.3, %outer.inc ] - %cmp = icmp slt i32 %I.0, %N - br i1 %cmp, label %middle, label %exit, !prof !3 ; 1:1 - -; Mass = 1 -; Backedge mass = 1/3, exit mass = 2/3 -; Loop scale = 3/2 -; Psuedo-edges = outer.inc -; Psuedo-mass = 1/2 -; Frequency = 2*1*3/2*1/2 = 3/2 -; CHECK-NEXT: middle: float = 1.5, -middle: - %J.0 = phi i32 [ %I.0, %outer ], [ %inc9, %middle.inc ] - %Return.1 = phi i32 [ %Return.0, %outer ], [ %Return.2, %middle.inc ] - %cmp2 = icmp slt i32 %J.0, %N - br i1 %cmp2, label %inner, label %outer.inc, !prof !2 ; 2:1 - -; Mass = 1 -; Backedge mass = 3/5, exit mass = 2/5 -; Loop scale = 5/2 -; Pseudo-edges = middle.inc @ 1/5, outer.inc @ 1/5 -; Pseudo-mass = 2/3 -; Frequency = 3/2*1*5/2*2/3 = 5/2 -; CHECK-NEXT: inner: float = 2.5, -inner: - %Return.2 = phi i32 [ %Return.1, %middle ], [ %call7, %inner.inc ] - %K.0 = phi i32 [ %J.0, %middle ], [ %inc, %inner.inc ] - %cmp5 = icmp slt i32 %K.0, %N - br i1 %cmp5, label %inner.body, label %middle.inc, !prof !1 ; 4:1 - -; Mass = 4/5 -; Frequency = 5/2*4/5 = 2 -; CHECK-NEXT: inner.body: float = 2.0, -inner.body: - %call = call i32 @c3(i32 %I.0, i32 %J.0, i32 %K.0) - %tobool = icmp ne i32 %call, 0 - br i1 %tobool, label %outer.inc, label %inner.inc, !prof !0 ; 3:1 - -; Mass = 3/5 -; Frequency = 5/2*3/5 = 3/2 -; CHECK-NEXT: inner.inc: float = 1.5, -inner.inc: - %call7 = call i32 @logic3(i32 %Return.2, i32 %I.0, i32 %J.0, i32 %K.0) - %inc = add nsw i32 %K.0, 1 - br label %inner - -; Mass = 1/3 -; Frequency = 3/2*1/3 = 1/2 -; CHECK-NEXT: middle.inc: float = 0.5, -middle.inc: - %inc9 = add nsw i32 %J.0, 1 - br label %middle - -; Mass = 1/2 -; Frequency = 2*1/2 = 1 -; CHECK-NEXT: outer.inc: float = 1.0, -outer.inc: - %Return.3 = phi i32 [ %Return.2, %inner.body ], [ %Return.1, %middle ] - %inc12 = add nsw i32 %I.0, 1 - br label %outer - -; Mass = 1 -; Frequency = 1 -; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] -exit: - ret i32 %Return.0 -} - -!3 = metadata !{metadata !"branch_weights", i32 1, i32 1} - -declare i32 @c3(i32, i32, i32) -declare i32 @logic3(i32, i32, i32, i32) diff --git a/test/Analysis/BlockFrequencyInfo/irreducible.ll b/test/Analysis/BlockFrequencyInfo/irreducible.ll deleted file mode 100644 index 46a2958700..0000000000 --- a/test/Analysis/BlockFrequencyInfo/irreducible.ll +++ /dev/null @@ -1,197 +0,0 @@ -; RUN: opt < %s -analyze -block-freq | FileCheck %s - -; A loop with multiple exits should be handled correctly. -; -; CHECK-LABEL: Printing analysis {{.*}} for function 'multiexit': -; CHECK-NEXT: block-frequency-info: multiexit -define void @multiexit(i32 %a) { -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] -entry: - br label %loop.1 - -; CHECK-NEXT: loop.1: float = 1.333{{3*}}, -loop.1: - %i = phi i32 [ 0, %entry ], [ %inc.2, %loop.2 ] - call void @f(i32 %i) - %inc.1 = add i32 %i, 1 - %cmp.1 = icmp ugt i32 %inc.1, %a - br i1 %cmp.1, label %exit.1, label %loop.2, !prof !0 - -; CHECK-NEXT: loop.2: float = 0.666{{6*7}}, -loop.2: - call void @g(i32 %inc.1) - %inc.2 = add i32 %inc.1, 1 - %cmp.2 = icmp ugt i32 %inc.2, %a - br i1 %cmp.2, label %exit.2, label %loop.1, !prof !1 - -; CHECK-NEXT: exit.1: float = 0.666{{6*7}}, -exit.1: - call void @h(i32 %inc.1) - br label %return - -; CHECK-NEXT: exit.2: float = 0.333{{3*}}, -exit.2: - call void @i(i32 %inc.2) - br label %return - -; CHECK-NEXT: return: float = 1.0, int = [[ENTRY]] -return: - ret void -} - -declare void @f(i32 %x) -declare void @g(i32 %x) -declare void @h(i32 %x) -declare void @i(i32 %x) - -!0 = metadata !{metadata !"branch_weights", i32 3, i32 3} -!1 = metadata !{metadata !"branch_weights", i32 5, i32 5} - -; The current BlockFrequencyInfo algorithm doesn't handle multiple entrances -; into a loop very well. The frequencies assigned to blocks in the loop are -; predictable (and not absurd), but also not correct and therefore not worth -; testing. -; -; There are two testcases below. -; -; For each testcase, I use a CHECK-NEXT/NOT combo like an XFAIL with the -; granularity of a single check. If/when this behaviour is fixed, we'll know -; about it, and the test should be updated. -; -; Testcase #1 -; =========== -; -; In this case c1 and c2 should have frequencies of 15/7 and 13/7, -; respectively. To calculate this, consider assigning 1.0 to entry, and -; distributing frequency iteratively (to infinity). At the first iteration, -; entry gives 3/4 to c1 and 1/4 to c2. At every step after, c1 and c2 give 3/4 -; of what they have to each other. Somehow, all of it comes out to exit. -; -; c1 = 3/4 + 1/4*3/4 + 3/4*3^2/4^2 + 1/4*3^3/4^3 + 3/4*3^3/4^3 + ... -; c2 = 1/4 + 3/4*3/4 + 1/4*3^2/4^2 + 3/4*3^3/4^3 + 1/4*3^3/4^3 + ... -; -; Simplify by splitting up the odd and even terms of the series and taking out -; factors so that the infite series matches: -; -; c1 = 3/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...) -; + 3/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...) -; c2 = 1/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...) -; + 9/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...) -; -; c1 = 15/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...) -; c2 = 13/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...) -; -; Since this geometric series sums to 16/7: -; -; c1 = 15/7 -; c2 = 13/7 -; -; If we treat c1 and c2 as members of the same loop, the exit frequency of the -; loop as a whole is 1/4, so the loop scale should be 4. Summing c1 and c2 -; gives 28/7, or 4.0, which is nice confirmation of the math above. -; -; However, assuming c1 precedes c2 in reverse post-order, the current algorithm -; returns 3/4 and 13/16, respectively. LoopInfo ignores edges between loops -; (and doesn't see any loops here at all), and -block-freq ignores the -; irreducible edge from c2 to c1. -; -; CHECK-LABEL: Printing analysis {{.*}} for function 'multientry': -; CHECK-NEXT: block-frequency-info: multientry -define void @multientry(i32 %a) { -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] -entry: - %choose = call i32 @choose(i32 %a) - %compare = icmp ugt i32 %choose, %a - br i1 %compare, label %c1, label %c2, !prof !2 - -; This is like a single-line XFAIL (see above). -; CHECK-NEXT: c1: -; CHECK-NOT: float = 2.142857{{[0-9]*}}, -c1: - %i1 = phi i32 [ %a, %entry ], [ %i2.inc, %c2 ] - %i1.inc = add i32 %i1, 1 - %choose1 = call i32 @choose(i32 %i1) - %compare1 = icmp ugt i32 %choose1, %a - br i1 %compare1, label %c2, label %exit, !prof !2 - -; This is like a single-line XFAIL (see above). -; CHECK-NEXT: c2: -; CHECK-NOT: float = 1.857142{{[0-9]*}}, -c2: - %i2 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ] - %i2.inc = add i32 %i2, 1 - %choose2 = call i32 @choose(i32 %i2) - %compare2 = icmp ugt i32 %choose2, %a - br i1 %compare2, label %c1, label %exit, !prof !2 - -; We still shouldn't lose any frequency. -; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] -exit: - ret void -} - -; Testcase #2 -; =========== -; -; In this case c1 and c2 should be treated as equals in a single loop. The -; exit frequency is 1/3, so the scaling factor for the loop should be 3.0. The -; loop is entered 2/3 of the time, and c1 and c2 split the total loop frequency -; evenly (1/2), so they should each have frequencies of 1.0 (3.0*2/3*1/2). -; Another way of computing this result is by assigning 1.0 to entry and showing -; that c1 and c2 should accumulate frequencies of: -; -; 1/3 + 2/9 + 4/27 + 8/81 + ... -; 2^0/3^1 + 2^1/3^2 + 2^2/3^3 + 2^3/3^4 + ... -; -; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent -; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This -; infinite series sums to 1. -; -; However, assuming c1 precedes c2 in reverse post-order, the current algorithm -; returns 1/2 and 3/4, respectively. LoopInfo ignores edges between loops (and -; treats c1 and c2 as self-loops only), and -block-freq ignores the irreducible -; edge from c2 to c1. -; -; Below I use a CHECK-NEXT/NOT combo like an XFAIL with the granularity of a -; single check. If/when this behaviour is fixed, we'll know about it, and the -; test should be updated. -; -; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops': -; CHECK-NEXT: block-frequency-info: crossloops -define void @crossloops(i32 %a) { -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] -entry: - %choose = call i32 @choose(i32 %a) - switch i32 %choose, label %exit [ i32 1, label %c1 - i32 2, label %c2 ], !prof !3 - -; This is like a single-line XFAIL (see above). -; CHECK-NEXT: c1: -; CHECK-NOT: float = 1.0, -c1: - %i1 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ], [ %i2.inc, %c2 ] - %i1.inc = add i32 %i1, 1 - %choose1 = call i32 @choose(i32 %i1) - switch i32 %choose1, label %exit [ i32 1, label %c1 - i32 2, label %c2 ], !prof !3 - -; This is like a single-line XFAIL (see above). -; CHECK-NEXT: c2: -; CHECK-NOT: float = 1.0, -c2: - %i2 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ], [ %i2.inc, %c2 ] - %i2.inc = add i32 %i2, 1 - %choose2 = call i32 @choose(i32 %i2) - switch i32 %choose2, label %exit [ i32 1, label %c1 - i32 2, label %c2 ], !prof !3 - -; We still shouldn't lose any frequency. -; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] -exit: - ret void -} - -declare i32 @choose(i32) - -!2 = metadata !{metadata !"branch_weights", i32 3, i32 1} -!3 = metadata !{metadata !"branch_weights", i32 2, i32 2, i32 2} diff --git a/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll b/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll deleted file mode 100644 index 9d27b6bf0f..0000000000 --- a/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll +++ /dev/null @@ -1,44 +0,0 @@ -; RUN: opt < %s -analyze -block-freq | FileCheck %s - -; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_with_branch': -; CHECK-NEXT: block-frequency-info: loop_with_branch -define void @loop_with_branch(i32 %a) { -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] -entry: - %skip_loop = call i1 @foo0(i32 %a) - br i1 %skip_loop, label %skip, label %header, !prof !0 - -; CHECK-NEXT: skip: float = 0.25, -skip: - br label %exit - -; CHECK-NEXT: header: float = 4.5, -header: - %i = phi i32 [ 0, %entry ], [ %i.next, %back ] - %i.next = add i32 %i, 1 - %choose = call i2 @foo1(i32 %i) - switch i2 %choose, label %exit [ i2 0, label %left - i2 1, label %right ], !prof !1 - -; CHECK-NEXT: left: float = 1.5, -left: - br label %back - -; CHECK-NEXT: right: float = 2.25, -right: - br label %back - -; CHECK-NEXT: back: float = 3.75, -back: - br label %header - -; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] -exit: - ret void -} - -declare i1 @foo0(i32) -declare i2 @foo1(i32) - -!0 = metadata !{metadata !"branch_weights", i32 1, i32 3} -!1 = metadata !{metadata !"branch_weights", i32 1, i32 2, i32 3} diff --git a/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll b/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll deleted file mode 100644 index d93ffceb5f..0000000000 --- a/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll +++ /dev/null @@ -1,59 +0,0 @@ -; RUN: opt < %s -analyze -block-freq | FileCheck %s - -; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loop_with_branches' -; CHECK-NEXT: block-frequency-info: nested_loop_with_branches -define void @nested_loop_with_branches(i32 %a) { -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] -entry: - %v0 = call i1 @foo0(i32 %a) - br i1 %v0, label %exit, label %outer, !prof !0 - -; CHECK-NEXT: outer: float = 12.0, -outer: - %i = phi i32 [ 0, %entry ], [ %i.next, %inner.end ], [ %i.next, %no_inner ] - %i.next = add i32 %i, 1 - %do_inner = call i1 @foo1(i32 %i) - br i1 %do_inner, label %no_inner, label %inner, !prof !0 - -; CHECK-NEXT: inner: float = 36.0, -inner: - %j = phi i32 [ 0, %outer ], [ %j.next, %inner.end ] - %side = call i1 @foo3(i32 %j) - br i1 %side, label %left, label %right, !prof !0 - -; CHECK-NEXT: left: float = 9.0, -left: - %v4 = call i1 @foo4(i32 %j) - br label %inner.end - -; CHECK-NEXT: right: float = 27.0, -right: - %v5 = call i1 @foo5(i32 %j) - br label %inner.end - -; CHECK-NEXT: inner.end: float = 36.0, -inner.end: - %stay_inner = phi i1 [ %v4, %left ], [ %v5, %right ] - %j.next = add i32 %j, 1 - br i1 %stay_inner, label %inner, label %outer, !prof !1 - -; CHECK-NEXT: no_inner: float = 3.0, -no_inner: - %continue = call i1 @foo6(i32 %i) - br i1 %continue, label %outer, label %exit, !prof !1 - -; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] -exit: - ret void -} - -declare i1 @foo0(i32) -declare i1 @foo1(i32) -declare i1 @foo2(i32) -declare i1 @foo3(i32) -declare i1 @foo4(i32) -declare i1 @foo5(i32) -declare i1 @foo6(i32) - -!0 = metadata !{metadata !"branch_weights", i32 1, i32 3} -!1 = metadata !{metadata !"branch_weights", i32 3, i32 1} diff --git a/test/CodeGen/XCore/llvm-intrinsics.ll b/test/CodeGen/XCore/llvm-intrinsics.ll index b436282615..e0acd66e4a 100644 --- a/test/CodeGen/XCore/llvm-intrinsics.ll +++ b/test/CodeGen/XCore/llvm-intrinsics.ll @@ -287,8 +287,9 @@ define void @Unwind1() { ; CHECKFP: .LBB{{[0-9_]+}} ; CHECKFP-NEXT: ldc r2, 40 ; CHECKFP-NEXT: add r2, r10, r2 -; CHECKFP-NEXT: add r2, r2, r0 +; CHECKFP-NEXT: add r0, r2, r0 ; CHECKFP-NEXT: mov r3, r1 +; CHECKFP-NEXT: mov r2, r0 ; CHECKFP-NEXT: ldw r9, r10[4] ; CHECKFP-NEXT: ldw r8, r10[5] ; CHECKFP-NEXT: ldw r7, r10[6] @@ -336,8 +337,9 @@ define void @Unwind1() { ; CHECK-NEXT: ldc r2, 36 ; CHECK-NEXT: ldaw r3, sp[0] ; CHECK-NEXT: add r2, r3, r2 -; CHECK-NEXT: add r2, r2, r0 +; CHECK-NEXT: add r0, r2, r0 ; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: ldw r10, sp[2] ; CHECK-NEXT: ldw r9, sp[3] ; CHECK-NEXT: ldw r8, sp[4] -- cgit v1.2.3