From 9862f8b4bb67c030949c5c5a17b541e7d072b2a7 Mon Sep 17 00:00:00 2001
From: Eric Bavier <eric.bavier@sifive.com>
Date: Thu, 12 Dec 2024 09:57:57 -0600
Subject: [PATCH] Improves accuracy of vector square root approximation
 example.

The previous implementation was unnecessarily inaccurate:

  Measured max error of 2.647303869947791 ulp @ 2.0831878e-19
        func (0x1.ebe0fap-63) = 0x1.f5d6b2p-32
    func_ref (0x1.ebe0fap-63) = 0x1.f5d6acb494965p-32
  2'139'095'041 tested in [0 , inf]
   57.2114955877736548% correctly rounded  (  1223808265)
   88.4557938629712339% faithfully rounded (+  668345235)
   11.5442061370287661% >= 1.0 ulp         (+  246941541)

Which is 21.6 bits of precision rater than the claimed 23.

The new implementation has lower maximum error and higher correctly-
and faithfully-rounded rates:

  Measured max error of 0.800413005053997 ulp @ 3.0387554e-29
        func (0x1.342a9ap-95) = 0x1.8d378p-48
    func_ref (0x1.342a9ap-95) = 0x1.8d378199cfbbcp-48
  2'139'095'041 tested in [0 , inf]
   87.4400844352198163% correctly rounded  (  1870426510)
  100.0000000000000000% faithfully rounded (+  268668531)

With is 23.3 bits of precision.

* src/vector-examples.adoc (Square root approximation example): Use
`r - (0.5 r) ((x r) r - 1)` formulation for 1/sqrt(x) estimate
refinement.
---
 src/vector-examples.adoc | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/vector-examples.adoc b/src/vector-examples.adoc
index 4577c9216..41d81eaaa 100644
--- a/src/vector-examples.adoc
+++ b/src/vector-examples.adoc
@@ -95,25 +95,25 @@ vfmul.vv v1, v1, v3         # Estimate of v1/v2
 === Square root approximation example
 
 ----
-# v1 = sqrt(v1) to almost 23 bits of precision.
+# v1 = sqrt(v1) to more than 23 bits of precision.
 
   fmv.w.x ft0, x0           # Mask off zero inputs
-vmfne.vf v0, v1, ft0        #   to avoid div by zero
-vfrsqrt7.v v2, v1, v0.t     # Estimate 1/sqrt(x)
-vmfne.vf v0, v2, ft0, v0.t  # Additionally mask off +inf inputs
-  li t0, 0x40400000
-vmv.v.x v4, t0              # Splat 3.0
-vfmul.vv v3, v1, v2, v0.t   # x * est
-vfnmsub.vv v3, v2, v4, v0.t # - x * est * est + 3
-vfmul.vv v3, v3, v2, v0.t   # est * (-x * est * est + 3)
-  li t0, 0x3f000000
-  fmv.w.x ft0, t0           # 0.5
-vfmul.vf v2, v3, ft0, v0.t  # Estimate to 14 bits
-vfmul.vv v3, v1, v2, v0.t   # x * est
-vfnmsub.vv v3, v2, v4, v0.t # - x * est * est + 3
-vfmul.vv v3, v3, v2, v0.t   # est * (-x * est * est + 3)
-vfmul.vf v2, v3, ft0, v0.t  # Estimate to 23 bits
-vfmul.vv v1, v2, v1, v0.t   # x * 1/sqrt(x)
+vmfne.vf v0, v1, ft0        #   to avoid DZ exception
+vfrsqrt7.v v2, v1, v0.t     # Estimate r ~= 1/sqrt(v1)
+vmfne.vf v0, v2, ft0, v0.t  # Mask off +inf to avoid NV
+  li t0, 0x3f800000
+  fli.s ft0, 0.5
+vmv.v.x v5, t0              # Splat 1.0
+vfmul.vv v3, v1, v2, v0.t   # t = v1 r
+vfmul.vf v4, v2, ft0, v0.t  # 0.5 r
+vfmsub.vv v3, v2, v5, v0.t  # t r - 1
+vfnmsac.vv v2, v3, v4, v0.t # r - (0.5 r) (t r - 1)
+                            # Better estimate of 1/sqrt(v1)
+vfmul.vv v1, v1, v2, v0.t   # t = v1 r
+vfmsub.vv v2, v1, v5, v0.t  # t r - 1
+vfmul.vf v3, v1, ft0, v0.t  # 0.5 t
+vfnmsac.vv v1, v2, v3, v0.t # t - (0.5 t) (t r - 1)
+                            # ~ sqrt(v1) to about 23.3 bits
 ----
 
 === C standard library strcmp example