From 3fd56c38b232883c71663a126c03c3c36e7dde6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= <albin.ahlback@gmail.com>
Date: Mon, 11 Mar 2024 15:41:22 +0100
Subject: [PATCH 1/2] Clarify usage of low-level addition and subtraction
 macros

---
 doc/source/longlong.rst | 111 +++++++++++++++++++---------------------
 1 file changed, 54 insertions(+), 57 deletions(-)

diff --git a/doc/source/longlong.rst b/doc/source/longlong.rst
index a11f32e8e0..e4b84253f2 100644
--- a/doc/source/longlong.rst
+++ b/doc/source/longlong.rst
@@ -3,90 +3,87 @@
 **longlong.h** -- support functions for multi-word arithmetic
 ===============================================================================
 
+Leading and trailing zeroes
+-------------------------------------------------------------------------------
 
-Auxiliary asm macros
---------------------------------------------------------------------------------
+.. macro:: flint_clz(x)
 
+    Returns the number of zero-bits from the msb to the first non-zero bit in
+    the limb `x`.  This is the number of steps `x` needs to be shifted left to
+    set the most significant bit in `x`. If `x` is zero then the return value is
+    undefined.
 
-.. macro:: umul_ppmm(high_prod, low_prod, multiplier, multiplicand)
+.. macro:: flint_ctz(x)
 
-    Multiplies two single limb integers ``MULTIPLIER`` and 
-    ``MULTIPLICAND``, and generates a two limb product in 
-    ``HIGH_PROD`` and ``LOW_PROD``.
+    As for ``flint_clz()``, but counts from the least significant end. If `x` is
+    zero then the return value is undefined.
 
-.. macro:: smul_ppmm(high_prod, low_prod, multiplier, multiplicand)
+Addition and subtraction
+-------------------------------------------------------------------------------
 
-    As for ``umul_ppmm()`` but the numbers are signed.
+.. note::
 
-.. macro:: udiv_qrnnd(quotient, remainder, high_numerator, low_numerator, denominator)
+    When aliasing inputs with outputs in these addition and subtraction macros,
+    make sure to have `s_{i}` aliased with `a_{i}` for addition macros, and
+    `d_{i}` aliased with `m_{i}` for subtraction macros, for optimal
+    performance. Moreover, keep immediates (constants known to the compiler) in
+    the `b_{i}` variables for addition and in `s_{i}` for subtraction.
 
-    Divides an unsigned integer, composed by the limb integers 
-    ``HIGH_NUMERATOR`` and ``LOW_NUMERATOR``, by ``DENOMINATOR`` 
-    and places the quotient in ``QUOTIENT`` and the remainder in 
-    ``REMAINDER``.  ``HIGH_NUMERATOR`` must be less than 
-    ``DENOMINATOR`` for correct operation. 
+.. macro:: add_ssaaaa(s1, s0, a1, a0, b1, b0)
 
-.. macro:: sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator, denominator)
+    Sets `s_1` and `s_0` according
+    to `c B^2 + s_1 B + s_0 = (a_1 B + a_0) + (b_1 B + b_0)`,
+    where `B = 2^{\mathtt{FLINT\_BITS}}` is the base, and `c` is the carry from
+    the addition which is not stored anywhere.
 
-    As for ``udiv_qrnnd()`` but the numbers are signed.  The quotient is 
-    rounded towards `0`. Note that as the quotient is signed it must lie in 
-    the range `[-2^63, 2^63)`.
+.. macro:: add_sssaaaaaa(s2, s1, s0, a2, a1, a0, b2, b1, b0)
 
-.. macro:: flint_clz(x)
+    Works like ``add_ssaaaa``, but for two three-limbed integers. Carry is lost.
 
-    Returns the number of zero-bits from the msb to the first non-zero bit in
-    the limb ``x``.  This is the number of steps ``x`` needs to be shifted left
-    to set the msb. If ``x`` is `0` then the return value is undefined.
+.. macro:: sub_ddmmss(d1, d0, m1, m0, s1, s0)
 
-.. macro:: flint_ctz(x)
+    Sets `d_1` and `d_0` to the difference between the two-limbed
+    integers `m_1 B + m_0` and `s_1 B + s_0`,
+    where `B = 2^{\mathtt{FLINT\_BITS}}`. Borrow from the subtraction is not
+    stored anywhere.
 
-    As for ``flint_clz()``, but counts from the least significant end. If ``x``
-    is zero then the return value is undefined.
+.. macro:: sub_dddmmmsss(d2, d1, d0, m2, m1, m0, s2, s1, s0)
 
-.. macro:: add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1, high_addend_2, low_addend_2)
+    Works like ``sub_ddmmss``, but for two three-limbed integers. Borrow is
+    lost.
 
-    Adds two limb integers, composed by ``HIGH_ADDEND_1`` and 
-    ``LOW_ADDEND_1``, and ``HIGH_ADDEND_2`` and ``LOW_ADDEND_2``, 
-    respectively.  The result is placed in ``HIGH_SUM`` and 
-    ``LOW_SUM``.  Overflow, i.e. carry out, is not stored anywhere, 
-    and is lost.
+Multiplication
+-------------------------------------------------------------------------------
 
-.. macro:: add_sssaaaaaa(high_sum, mid_sum, low_sum, high_addend_1, mid_addend_1, low_addend_1, high_addend_2, mid_addend_2, low_addend_2)
+.. macro:: umul_ppmm(p1, p0, u, v)
 
-    Adds two three limb integers. Carry out is lost.
+    Computes `p_1 B + p_0 = u v`, where `B = 2^{\mathtt{FLINT\_BITS}}`.
 
-.. macro:: sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend, high_subtrahend, low_subtrahend)
+.. macro:: smul_ppmm(p1, p0, u, v)
 
-    Subtracts two limb integers, composed by ``HIGH_MINUEND_1`` and 
-    ``LOW_MINUEND_1``, and ``HIGH_SUBTRAHEND_2`` and 
-    ``LOW_SUBTRAHEND_2``, respectively.  The result is placed in 
-    ``HIGH_DIFFERENCE`` and ``LOW_DIFFERENCE``.  Overflow, i.e. 
-    carry out is not stored anywhere, and is lost.
+    Works like ``umul_ppmm`` but for signed numbers.
 
-.. macro:: sub_dddmmmsss(high_diff, mid_diff, low_diff, high_minuend_1, mid_minuend_1, low_minuend_1, high_subtrahend_2, mid_subtrahend_2, low_subtrahend_2)
+Division
+-------------------------------------------------------------------------------
 
-    Subtracts two three limb integers. Borrow out is lost.
+.. macro:: udiv_qrnnd(q, r, n1, n0, d)
 
-.. macro:: byte_swap(x)
+    Computes the non-negative integers `q` and `r` in `d q + r = n_1 B + n_0`,
+    where `B = 2^{\mathtt{FLINT\_BITS}}`. Assumes that `n_1 < d`.
 
-    Swap the order of the bytes in the word `x`, i.e. most significant byte
-    becomes least significant byte, etc.
+.. macro:: sdiv_qrnnd(q, r, n1, n0, d)
 
-.. macro:: invert_limb(invxl, xl)
+    Works like ``udiv_qrnnd``, but for signed numbers.
 
-    Deprecated: see :func:`n_preinvert_limb_prenorm`.
+.. macro:: udiv_qrnnd_preinv(q, r, n1, n0, d, di)
 
-.. macro:: udiv_qrnnd_preinv(q, r, nh, nl, d, di)
+    Works like ``udiv_qrnnd``, but takes a precomputed inverse ``di`` as
+    computed by :func:`n_preinvert_limb`.
 
-    As for ``udiv_qrnnd()`` but takes a precomputed inverse ``di`` as 
-    computed by ``invert_limb()``. The algorithm, in terms of the theorem 
-    above, is::
+Miscellaneous
+-------------------------------------------------------------------------------
 
-        nadj = n1*(d-B/2) + n0
-        xh, xl = (n2+n1)*(m-B)
-        xh, xl += nadj + n2*B ( xh, xl = n2*B + (n2+n1)*(m-B) + n1*(d-B/2) + n0 )
-        _q1 = B - xh - 1
-        xh, xl = _q1*d + nh, nl - B*d = nh, nl - q1*d - d so that xh = 0 or -1
-        r = xl + xh*d where xh is 0 if q1 is off by 1, otherwise -1
-        q = xh - _q1 = xh + 1 + n2
+.. macro:: byte_swap(x)
 
+    Swap the order of the bytes in the word `x`, i.e. most significant byte
+    becomes least significant byte, etc.

From 6f11b77e2e1b5a896a066c254a970cb7ce801fde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= <albin.ahlback@gmail.com>
Date: Mon, 11 Mar 2024 15:50:27 +0100
Subject: [PATCH 2/2] Optimize instance of add_sssaaaaaa

---
 src/nmod_vec/dot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nmod_vec/dot.c b/src/nmod_vec/dot.c
index 258d075834..a1239bef0a 100644
--- a/src/nmod_vec/dot.c
+++ b/src/nmod_vec/dot.c
@@ -29,11 +29,11 @@ _nmod_vec_dot_bound_limbs(slong len, nmod_t mod)
     umul_ppmm(t1, t0, mod.n - 1, mod.n - 1);
     umul_ppmm(t2, t1, t1, len);
     umul_ppmm(u1, u0, t0, len);
-    add_sssaaaaaa(t2, t1, t0,  t2, t1, UWORD(0),  UWORD(0), u1, u0);
+    add_ssaaaa(t2, t1, t2, t1, UWORD(0), u1);
 
     if (t2 != 0) return 3;
     if (t1 != 0) return 2;
-    return (t0 != 0);
+    return (u0 != 0);
 }
 
 mp_limb_t