Skip to content

Commit b47ab92

Browse files
committed
device-libs: Move special case check in rsqrt f64 implementation
Move the edge-case check so that it tests the original input argument instead of the output of the rsq. This makes it simpler for optimizations to strip out the check based on value tracking. The resulting code is approximately as good as before. On targets with v_fmac_f64, the result looks worse in the most trivial example because of a bad decision not to rewrite the final fma to v_fma_f64 (llvm#171891). The result is equally good in other final-use contexts.
1 parent 0648d87 commit b47ab92

File tree

2 files changed

+5
-5
lines changed

2 files changed

+5
-5
lines changed

amd/device-libs/ocml/src/rsqrtD.cl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,7 @@ CONSTATTR double
1111
MATH_MANGLE(rsqrt)(double x)
1212
{
1313
double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
14-
double e = MATH_MAD(-x*y0, y0, 1.0);
15-
double y1 = MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
16-
return BUILTIN_CLASS_F64(y0, CLASS_PSUB|CLASS_PNOR) ? y1 : y0;
14+
double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
15+
return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
1716
}
1817

amd/device-libs/test/compile/rsqrt.cl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,14 @@ float test_rsqrt_f32(float x) {
2626

2727
// CHECK-LABEL: {{^}}test_rsqrt_f64:
2828
// CHECK: v_rsq_f64
29+
// CHECK: v_cmp_class_f64
30+
// CHECK: v_cndmask_b32
31+
// CHECK: v_cndmask_b32
2932
// CHECK: v_mul_f64
3033
// CHECK: v_fma_f64
3134
// CHECK: v_mul_f64
3235
// CHECK: v_fma_f64
3336
// CHECK: v_fma_f64
34-
// CHECK: v_cndmask_b32
35-
// CHECK: v_cndmask_b32
3637
double test_rsqrt_f64(double x) {
3738
return rsqrt(x);
3839
}

0 commit comments

Comments
 (0)