From 3fd56c38b232883c71663a126c03c3c36e7dde6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 11 Mar 2024 15:41:22 +0100 Subject: [PATCH 1/2] Clarify usage of low-level addition and subtraction macros --- doc/source/longlong.rst | 111 +++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 57 deletions(-) diff --git a/doc/source/longlong.rst b/doc/source/longlong.rst index a11f32e8e0..e4b84253f2 100644 --- a/doc/source/longlong.rst +++ b/doc/source/longlong.rst @@ -3,90 +3,87 @@ **longlong.h** -- support functions for multi-word arithmetic =============================================================================== +Leading and trailing zeroes +------------------------------------------------------------------------------- -Auxiliary asm macros --------------------------------------------------------------------------------- +.. macro:: flint_clz(x) + Returns the number of zero-bits from the msb to the first non-zero bit in + the limb `x`. This is the number of steps `x` needs to be shifted left to + set the most significant bit in `x`. If `x` is zero then the return value is + undefined. -.. macro:: umul_ppmm(high_prod, low_prod, multiplier, multiplicand) +.. macro:: flint_ctz(x) - Multiplies two single limb integers ``MULTIPLIER`` and - ``MULTIPLICAND``, and generates a two limb product in - ``HIGH_PROD`` and ``LOW_PROD``. + As for ``flint_clz()``, but counts from the least significant end. If `x` is + zero then the return value is undefined. -.. macro:: smul_ppmm(high_prod, low_prod, multiplier, multiplicand) +Addition and subtraction +------------------------------------------------------------------------------- - As for ``umul_ppmm()`` but the numbers are signed. +.. note:: -.. 
macro:: udiv_qrnnd(quotient, remainder, high_numerator, low_numerator, denominator) + When aliasing inputs with outputs in these addition and subtraction macros, + make sure to have `s_{i}` aliased with `a_{i}` for addition macros, and + `d_{i}` aliased with `m_{i}` for subtraction macros, for optimal performance. Moreover, keep + immediates (in other words, constants known to the compiler) in the `b_{i}` + variables for addition and `s_{i}` for subtraction. - Divides an unsigned integer, composed by the limb integers - ``HIGH_NUMERATOR`` and ``LOW_NUMERATOR``, by ``DENOMINATOR`` - and places the quotient in ``QUOTIENT`` and the remainder in - ``REMAINDER``. ``HIGH_NUMERATOR`` must be less than - ``DENOMINATOR`` for correct operation. +.. macro:: add_ssaaaa(s1, s0, a1, a0, b1, b0) -.. macro:: sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator, denominator) + Sets `s_1` and `s_0` according + to `c B^2 + s_1 B + s_0 = (a_1 B + a_0) + (b_1 B + b_0)`, + where `B = 2^{\mathtt{FLINT\_BITS}}` is the base, and `c` is the carry from + the addition which is not stored anywhere. - As for ``udiv_qrnnd()`` but the numbers are signed. The quotient is - rounded towards `0`. Note that as the quotient is signed it must lie in - the range `[-2^63, 2^63)`. +.. macro:: add_sssaaaaaa(s2, s1, s0, a2, a1, a0, b2, b1, b0) -.. macro:: flint_clz(x) + Works like ``add_ssaaaa``, but for two three-limbed integers. Carry is lost. - Returns the number of zero-bits from the msb to the first non-zero bit in - the limb ``x``. This is the number of steps ``x`` needs to be shifted left - to set the msb. If ``x`` is `0` then the return value is undefined. +.. macro:: sub_ddmmss(d1, d0, m1, m0, s1, s0) -.. macro:: flint_ctz(x) + Sets `d_1` and `d_0` to the difference between the two-limbed + integers `m_1 B + m_0` and `s_1 B + s_0`, + where `B = 2^{\mathtt{FLINT\_BITS}}`. Borrow from the subtraction is not + stored anywhere. - As for ``flint_clz()``, but counts from the least significant end. 
If ``x`` - is zero then the return value is undefined. +.. macro:: sub_dddmmmsss(d2, d1, d0, m2, m1, m0, s2, s1, s0) -.. macro:: add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1, high_addend_2, low_addend_2) + Works like ``sub_ddmmss``, but for two three-limbed integers. Borrow is + lost. - Adds two limb integers, composed by ``HIGH_ADDEND_1`` and - ``LOW_ADDEND_1``, and ``HIGH_ADDEND_2`` and ``LOW_ADDEND_2``, - respectively. The result is placed in ``HIGH_SUM`` and - ``LOW_SUM``. Overflow, i.e. carry out, is not stored anywhere, - and is lost. +Multiplication +------------------------------------------------------------------------------- -.. macro:: add_sssaaaaaa(high_sum, mid_sum, low_sum, high_addend_1, mid_addend_1, low_addend_1, high_addend_2, mid_addend_2, low_addend_2) +.. macro:: umul_ppmm(p1, p0, u, v) - Adds two three limb integers. Carry out is lost. + Computes `p_1 B + p_0 = u v`, where `B = 2^{\mathtt{FLINT\_BITS}}`. -.. macro:: sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend, high_subtrahend, low_subtrahend) +.. macro:: smul_ppmm(p1, p0, u, v) - Subtracts two limb integers, composed by ``HIGH_MINUEND_1`` and - ``LOW_MINUEND_1``, and ``HIGH_SUBTRAHEND_2`` and - ``LOW_SUBTRAHEND_2``, respectively. The result is placed in - ``HIGH_DIFFERENCE`` and ``LOW_DIFFERENCE``. Overflow, i.e. - carry out is not stored anywhere, and is lost. + Works like ``umul_ppmm`` but for signed numbers. -.. macro:: sub_dddmmmsss(high_diff, mid_diff, low_diff, high_minuend_1, mid_minuend_1, low_minuend_1, high_subtrahend_2, mid_subtrahend_2, low_subtrahend_2) +Division +------------------------------------------------------------------------------- - Subtracts two three limb integers. Borrow out is lost. +.. macro:: udiv_qrnnd(q, r, n1, n0, d) -.. macro:: byte_swap(x) + Computes the non-negative integers `q` and `r` in `d q + r = n_1 B + n_0`, + where `B = 2^{\mathtt{FLINT\_BITS}}`. Assumes that `n_1 < d`. 
- Swap the order of the bytes in the word `x`, i.e. most significant byte - becomes least significant byte, etc. +.. macro:: sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator, denominator) -.. macro:: invert_limb(invxl, xl) + Works like ``udiv_qrnnd``, but for signed numbers. - Deprecated: see :func:`n_preinvert_limb_prenorm`. +.. macro:: udiv_qrnnd_preinv(q, r, n1, n0, d, di) -.. macro:: udiv_qrnnd_preinv(q, r, nh, nl, d, di) + Works like ``udiv_qrnnd``, but takes a precomputed inverse ``di`` as + computed by :func:`n_preinvert_limb`. - As for ``udiv_qrnnd()`` but takes a precomputed inverse ``di`` as - computed by ``invert_limb()``. The algorithm, in terms of the theorem - above, is:: +Miscellaneous +------------------------------------------------------------------------------- - nadj = n1*(d-B/2) + n0 - xh, xl = (n2+n1)*(m-B) - xh, xl += nadj + n2*B ( xh, xl = n2*B + (n2+n1)*(m-B) + n1*(d-B/2) + n0 ) - _q1 = B - xh - 1 - xh, xl = _q1*d + nh, nl - B*d = nh, nl - q1*d - d so that xh = 0 or -1 - r = xl + xh*d where xh is 0 if q1 is off by 1, otherwise -1 - q = xh - _q1 = xh + 1 + n2 +.. macro:: byte_swap(x) + Swap the order of the bytes in the word `x`, i.e. most significant byte + becomes least significant byte, etc. 
From 6f11b77e2e1b5a896a066c254a970cb7ce801fde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 11 Mar 2024 15:50:27 +0100 Subject: [PATCH 2/2] Optimize instance of add_sssaaaaaa --- src/nmod_vec/dot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nmod_vec/dot.c b/src/nmod_vec/dot.c index 258d075834..a1239bef0a 100644 --- a/src/nmod_vec/dot.c +++ b/src/nmod_vec/dot.c @@ -29,11 +29,11 @@ _nmod_vec_dot_bound_limbs(slong len, nmod_t mod) umul_ppmm(t1, t0, mod.n - 1, mod.n - 1); umul_ppmm(t2, t1, t1, len); umul_ppmm(u1, u0, t0, len); - add_sssaaaaaa(t2, t1, t0, t2, t1, UWORD(0), UWORD(0), u1, u0); + add_ssaaaa(t2, t1, t2, t1, UWORD(0), u1); if (t2 != 0) return 3; if (t1 != 0) return 2; - return (t0 != 0); + return (u0 != 0); } mp_limb_t