3 files changed, 89 insertions, 55 deletions
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 1b38c027da..300821026a 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1047,6 +1047,14 @@ llvm::canConstantFoldCallTo(const Function *F) {
   case Intrinsic::smul_with_overflow:
   case Intrinsic::convert_from_fp16:
   case Intrinsic::convert_to_fp16:
+  case Intrinsic::x86_sse_cvtss2si:
+  case Intrinsic::x86_sse_cvtss2si64:
+  case Intrinsic::x86_sse_cvttss2si:
+  case Intrinsic::x86_sse_cvttss2si64:
+  case Intrinsic::x86_sse2_cvtsd2si:
+  case Intrinsic::x86_sse2_cvtsd2si64:
+  case Intrinsic::x86_sse2_cvttsd2si:
+  case Intrinsic::x86_sse2_cvttsd2si64:
     return true;
   default:
     return false;
@@ -1116,6 +1124,36 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
   return 0; // dummy return to suppress warning
 }
 
+/// ConstantFoldConvertToInt - Attempt to fold an SSE floating point to integer
+/// conversion of the constant Op. If roundTowardZero is true the value is
+/// truncated; otherwise round-to-nearest, ties-to-even is used, i.e. the
+/// rounding mode is assumed to be the default one rather than one changed at
+/// run time. The integer type Ty selects the width of the signed result (at
+/// most 64 bits). Returns null when APFloat reports any status other than
+/// opOK or opInexact, otherwise returns the ConstantInt resulting from the
+/// conversion.
+static Constant *ConstantFoldConvertToInt(ConstantFP *Op, bool roundTowardZero,
+                                          const Type *Ty) {
+  assert(Op && "Called with NULL operand");
+  APFloat Val(Op->getValueAPF());
+
+  // All of these conversion intrinsics form an integer of at most 64 bits.
+  unsigned ResultWidth = cast<IntegerType>(Ty)->getBitWidth();
+  assert(ResultWidth <= 64 &&
+         "Can only constant fold conversions to 64 and 32 bit ints");
+
+  uint64_t UIntVal;
+  bool isExact = false;
+  APFloat::roundingMode mode = roundTowardZero? APFloat::rmTowardZero
+                                              : APFloat::rmNearestTiesToEven;
+  APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
+                                                  /*isSigned=*/true, mode,
+                                                  &isExact);
+  if (status != APFloat::opOK && status != APFloat::opInexact)
+    return 0;
+  return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
+}
+
 /// ConstantFoldCall - Attempt to constant fold a call to the specified function
 /// with the specified arguments, returning null if unsuccessful.
 Constant *
@@ -1246,6 +1284,24 @@ llvm::ConstantFoldCall(Function *F,
       }
     }
 
+    if (ConstantVector *Op = dyn_cast<ConstantVector>(Operands[0])) {
+      switch (F->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::x86_sse_cvtss2si:
+      case Intrinsic::x86_sse_cvtss2si64:
+      case Intrinsic::x86_sse2_cvtsd2si:
+      case Intrinsic::x86_sse2_cvtsd2si64:
+        if (ConstantFP *FPOp = dyn_cast<ConstantFP>(Op->getOperand(0)))
+          return ConstantFoldConvertToInt(FPOp, /*roundTowardZero=*/false, Ty);
+      case Intrinsic::x86_sse_cvttss2si:
+      case Intrinsic::x86_sse_cvttss2si64:
+      case Intrinsic::x86_sse2_cvttsd2si:
+      case Intrinsic::x86_sse2_cvttsd2si64:
+        if (ConstantFP *FPOp = dyn_cast<ConstantFP>(Op->getOperand(0)))
+          return ConstantFoldConvertToInt(FPOp, /*roundTowardZero=*/true, Ty);
+      }
+    }
+
     if (isa<UndefValue>(Operands[0])) {
       if (F->getIntrinsicID() == Intrinsic::bswap)
         return Operands[0];
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 194a19219c..c3a9330ba6 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2259,58 +2259,3 @@ Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
 transform the fmul to 0.0, and then the fadd to 2.0.
 
 //===---------------------------------------------------------------------===//
-
-clang -O3 currently compiles this code:
-
-#include <emmintrin.h>
-int f(double x) { return _mm_cvtsd_si32(_mm_set_sd(x)); }
-int g(double x) { return _mm_cvttsd_si32(_mm_set_sd(x)); }
-
-into
-
-define i32 @_Z1fd(double %x) nounwind readnone {
-entry:
-  %vecinit.i = insertelement <2 x double> undef, double %x, i32 0
-  %vecinit1.i = insertelement <2 x double> %vecinit.i, double 0.000000e+00,i32 1
-  %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %vecinit1.i) nounwind
-  ret i32 %0
-}
-
-define i32 @_Z1gd(double %x) nounwind readnone {
-entry:
-  %conv.i = fptosi double %x to i32
-  ret i32 %conv.i
-}
-
-This difference carries over to the assmebly produced, resulting in:
-
-_Z1fd:                                  # @_Z1fd
-# BB#0:                                 # %entry
-        pushq   %rbp
-        movq    %rsp, %rbp
-        xorps   %xmm1, %xmm1
-        movsd   %xmm0, %xmm1
-        cvtsd2sil       %xmm1, %eax
-        popq    %rbp
-        ret
-
-_Z1gd:                                  # @_Z1gd
-# BB#0:                                 # %entry
-        pushq   %rbp
-        movq    %rsp, %rbp
-        cvttsd2si       %xmm0, %eax
-        popq    %rbp
-        ret
-
-The problem is that we can't see through the intrinsic call used for cvtsd2si,
-and fold away the unnecessary manipulation of the function parameter. When
-these functions are inlined, it forms a barrier preventing many further
-optimizations. LLVM IR doesn't have a good way to model the logic of
-'cvtsd2si', its only FP -> int conversion path forces truncation. We should add
-a rounding flag onto fptosi so that it can represent this type of rounding
-naturally in the IR rather than using intrinsics. We might need to use a
-'system_rounding_mode' flag to encode that the semantics of the rounding mode
-can be changed by the program, but ideally we could just say that isn't
-supported, and hard code the rounding.
-
-//===---------------------------------------------------------------------===//
diff --git a/test/Transforms/ConstProp/calls.ll b/test/Transforms/ConstProp/calls.ll
index a12fc82d64..82d73245ad 100644
--- a/test/Transforms/ConstProp/calls.ll
+++ b/test/Transforms/ConstProp/calls.ll
@@ -21,3 +21,36 @@ define double @T() {
   %c = fadd double %b, %D
   ret double %c
 }
+; 1.75 rounds to 2 (cvt*) and truncates to 1 (cvtt*); both sums fold to 6.
+define i1 @test_sse_cvt() nounwind readnone {
+; CHECK: @test_sse_cvt
+; CHECK-NOT: call
+; CHECK: ret i1 true
+entry:
+  %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+  %i1 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+  %i2 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+  %i3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+  %i4 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 1.75, double undef>) nounwind
+  %i5 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 1.75, double undef>) nounwind
+  %i6 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 1.75, double undef>) nounwind
+  %i7 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 1.75, double undef>) nounwind
+  %sum11 = add i32 %i0, %i1
+  %sum12 = add i32 %i4, %i5
+  %sum1 = add i32 %sum11, %sum12
+  %sum21 = add i64 %i2, %i3
+  %sum22 = add i64 %i6, %i7
+  %sum2 = add i64 %sum21, %sum22
+  %sum1.sext = sext i32 %sum1 to i64
+  %b = icmp eq i64 %sum1.sext, %sum2
+  ret i1 %b
+}
+
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone