From 9862f8b4bb67c030949c5c5a17b541e7d072b2a7 Mon Sep 17 00:00:00 2001 From: Eric Bavier Date: Thu, 12 Dec 2024 09:57:57 -0600 Subject: [PATCH] Improves accuracy of vector square root approximation example. The previous implementation was unnecessarily inaccurate: Measured max error of 2.647303869947791 ulp @ 2.0831878e-19 func (0x1.ebe0fap-63) = 0x1.f5d6b2p-32 func_ref (0x1.ebe0fap-63) = 0x1.f5d6acb494965p-32 2'139'095'041 tested in [0 , inf] 57.2114955877736548% correctly rounded ( 1223808265) 88.4557938629712339% faithfully rounded (+ 668345235) 11.5442061370287661% >= 1.0 ulp (+ 246941541) Which is 21.6 bits of precision rater than the claimed 23. The new implementation has lower maximum error and higher correctly- and faithfully-rounded rates: Measured max error of 0.800413005053997 ulp @ 3.0387554e-29 func (0x1.342a9ap-95) = 0x1.8d378p-48 func_ref (0x1.342a9ap-95) = 0x1.8d378199cfbbcp-48 2'139'095'041 tested in [0 , inf] 87.4400844352198163% correctly rounded ( 1870426510) 100.0000000000000000% faithfully rounded (+ 268668531) With is 23.3 bits of precision. * src/vector-examples.adoc (Square root approximation example): Use `r - (0.5 r) ((x r) r - 1)` formulation for 1/sqrt(x) estimate refinement. --- src/vector-examples.adoc | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/vector-examples.adoc b/src/vector-examples.adoc index 4577c9216..41d81eaaa 100644 --- a/src/vector-examples.adoc +++ b/src/vector-examples.adoc @@ -95,25 +95,25 @@ vfmul.vv v1, v1, v3 # Estimate of v1/v2 === Square root approximation example ---- -# v1 = sqrt(v1) to almost 23 bits of precision. +# v1 = sqrt(v1) to more than 23 bits of precision. fmv.w.x ft0, x0 # Mask off zero inputs -vmfne.vf v0, v1, ft0 # to avoid div by zero -vfrsqrt7.v v2, v1, v0.t # Estimate 1/sqrt(x) -vmfne.vf v0, v2, ft0, v0.t # Additionally mask off +inf inputs - li t0, 0x40400000 -vmv.v.x v4, t0 # Splat 3.0 -vfmul.vv v3, v1, v2, v0.t # x * est -vfnmsub.vv v3, v2, v4, v0.t # - x * est * est + 3 -vfmul.vv v3, v3, v2, v0.t # est * (-x * est * est + 3) - li t0, 0x3f000000 - fmv.w.x ft0, t0 # 0.5 -vfmul.vf v2, v3, ft0, v0.t # Estimate to 14 bits -vfmul.vv v3, v1, v2, v0.t # x * est -vfnmsub.vv v3, v2, v4, v0.t # - x * est * est + 3 -vfmul.vv v3, v3, v2, v0.t # est * (-x * est * est + 3) -vfmul.vf v2, v3, ft0, v0.t # Estimate to 23 bits -vfmul.vv v1, v2, v1, v0.t # x * 1/sqrt(x) +vmfne.vf v0, v1, ft0 # to avoid DZ exception +vfrsqrt7.v v2, v1, v0.t # Estimate r ~= 1/sqrt(v1) +vmfne.vf v0, v2, ft0, v0.t # Mask off +inf to avoid NV + li t0, 0x3f800000 + fli.s ft0, 0.5 +vmv.v.x v5, t0 # Splat 1.0 +vfmul.vv v3, v1, v2, v0.t # t = v1 r +vfmul.vf v4, v2, ft0, v0.t # 0.5 r +vfmsub.vv v3, v2, v5, v0.t # t r - 1 +vfnmsac.vv v2, v3, v4, v0.t # r - (0.5 r) (t r - 1) + # Better estimate of 1/sqrt(v1) +vfmul.vv v1, v1, v2, v0.t # t = v1 r +vfmsub.vv v2, v1, v5, v0.t # t r - 1 +vfmul.vf v3, v1, ft0, v0.t # 0.5 t +vfnmsac.vv v1, v2, v3, v0.t # t - (0.5 t) (t r - 1) + # ~ sqrt(v1) to about 23.3 bits ---- === C standard library strcmp example