Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix vec_copysign implementations per issue #158, Part 1B. #161

Merged
merged 1 commit into from
Feb 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 22 additions & 11 deletions src/pveclib/vec_f128_ppc.h
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ vec_xsxexpqp (__binary128 f128)
* \endcode
*
* \note Would like to use the intrinsic scalar_extract_exp() here but
* this is not available until GCC 11.
* this is not available until GCC 11 (or later).
* Also GCC defines these scalar built-ins to return integer scalar
* values in GPRs.
* This would defeat the purpose of an all vector implementation.
Expand Down Expand Up @@ -1769,7 +1769,7 @@ static inline vec_xscvsqqp (vi128_t int128)
lo64 = int64[VEC_DW_L];
result = (hi64 * two64) + lo64;
// copy the __int128's sign into the __binary128 result
result = vec_copysignf128 (result, i_sign);
result = vec_copysignf128 (i_sign, result);
#elif defined (_ARCH_PWR8)
...
#endif
Expand Down Expand Up @@ -3806,26 +3806,37 @@ vec_all_iszerof128 (__binary128 f128)
#endif
}

/** \brief Copy the sign bit from f128y and merge with the magnitude
* from f128x. The merged result is returned as a __float128 value.
/** \brief Copy the sign bit from f128x and merge with the magnitude
* from f128y. The merged result is returned as a __float128 value.
*
* \note This operation was patterned after the intrinsic vec_cpsgn
* (altivec.h) introduced for POWER7 and VSX. It turns out the
* original (GCC 4.9) compiler implementation reversed the operands
* and does not match the PowerISA or the Vector Intrinsic Programming
* Reference manuals. Subsequent compilers and PVECLIB
* implementations replicated this (operand order) error.
* This has now been reported as a bug against the compilers, which are
* in the process of applying fixes and distributing updates.
* This version of PVECLIB is updated to match the Vector Intrinsic
* Programming Reference.
*
* |processor|Latency|Throughput|
* |--------:|:-----:|:---------|
* |power8 | 2-11 | 2/cycle |
* |power9 | 2 | 4/cycle |
*
* @param f128x a __float128 value containing the magnitude.
* @param f128y a __float128 value containing the sign bit.
* @return a __float128 value with magnitude from f128x and the
* sign of f128y.
* @param f128x a __float128 value containing the sign bit.
* @param f128y a __float128 value containing the magnitude.
* @return a __float128 value with magnitude from f128y and the
* sign of f128x.
*/
static inline __binary128
vec_copysignf128 (__binary128 f128x, __binary128 f128y)
{
__binary128 result;
#if _ARCH_PWR9
__asm__(
"xscpsgnqp %0,%2,%1;\n"
"xscpsgnqp %0,%1,%2;\n"
: "=v" (result)
: "v" (f128x), "v" (f128y)
:);
Expand All @@ -3835,7 +3846,7 @@ vec_copysignf128 (__binary128 f128x, __binary128 f128y)
tmpx = vec_xfer_bin128_2_vui32t (f128x);
tmpy = vec_xfer_bin128_2_vui32t (f128y);

tmp = vec_sel (tmpx, tmpy, signmask);
tmp = vec_sel (tmpy, tmpx, signmask);
result = vec_xfer_vui32t_2_bin128 (tmp);
#endif
return (result);
Expand Down Expand Up @@ -8006,7 +8017,7 @@ static inline vec_xscvsqqp (vi128_t int128)
lo64 = int64[VEC_DW_L];
result = (hi64 * two64) + lo64;
// Copy the __int128's sign into the __binary128 result
result = vec_copysignf128 (result, i_sign);
result = vec_copysignf128 (i_sign, result);
#elif defined (_ARCH_PWR8)
vui64_t q_exp;
vui128_t q_sig;
Expand Down
37 changes: 29 additions & 8 deletions src/pveclib/vec_f32_ppc.h
Original file line number Diff line number Diff line change
Expand Up @@ -788,31 +788,52 @@ vec_any_iszerof32 (vf32_t vf32)
#endif
}

/** \brief Copy the sign bit from vf32y merged with magnitude from
* vf32x and return the resulting vector float values.
/** \brief Copy the sign bit from vf32x merged with magnitude from
* vf32y and return the resulting vector float values.
*
* \note This operation was patterned after the intrinsic vec_cpsgn
* (altivec.h) introduced for POWER7 and VSX. It turns out the
* original (GCC 4.9) compiler implementation reversed the operands
* and does not match the PowerISA or the Vector Intrinsic Programming
* Reference manuals. Subsequent compilers and PVECLIB
* implementations replicated this (operand order) error.
* This has now been reported as a bug against the compilers, which are
* in the process of applying fixes and distributing updates.
* This version of PVECLIB is updated to match the Vector Intrinsic
* Programming Reference. This implementation is independent of the
* compilers' update status.
*
* |processor|Latency|Throughput|
* |--------:|:-----:|:---------|
* |power8 | 6-7 | 2/cycle |
* |power9 | 2 | 2/cycle |
*
* @param vf32x vector float values containing the magnitudes.
* @param vf32y vector float values containing the sign bits.
* @return vector float values with magnitude from vf32x and the
* sign of vf32y.
* @param vf32x vector float values containing the sign bits.
* @param vf32y vector float values containing the magnitudes.
* @return vector float values with magnitude from vf32y and the
* sign of vf32x.
*/
static inline vf32_t
vec_copysignf32 (vf32_t vf32x, vf32_t vf32y)
{
#if _ARCH_PWR7
/* P9 has a 2 cycle xvcpsgnsp and eliminates a const load. */
#ifdef PVECLIB_CPSGN_FIXED
return (vec_cpsgn (vf32x, vf32y));
#else
vf32_t result;
__asm__(
"xvcpsgnsp %x0,%x1,%x2;\n"
: "=wa" (result)
: "wa" (vf32x), "wa" (vf32y)
:);
return (result);
#endif
#else
const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000,
0x80000000, 0x80000000);
vf32_t result;

result = (vf32_t)vec_sel ((vui32_t)vf32x, (vui32_t)vf32y, signmask);
result = (vf32_t)vec_sel ((vui32_t)vf32y, (vui32_t)vf32x, signmask);
return (result);
#endif
}
Expand Down
46 changes: 34 additions & 12 deletions src/pveclib/vec_f64_ppc.h
Original file line number Diff line number Diff line change
Expand Up @@ -780,31 +780,53 @@ vec_any_iszerof64 (vf64_t vf64)
#endif
}

/** \brief Copy the sign bit from vf64y merged with magnitude from
* vf64x and return the resulting vector double values.
/** \brief Copy the sign bit from vf64x merged with magnitude from
* vf64y and return the resulting vector double values.
*
* \note This operation was patterned after the intrinsic vec_cpsgn
* (altivec.h) introduced for POWER7 and VSX. It turns out the
* original (GCC 4.9) compiler implementation reversed the operands
* and does not match the PowerISA or the Vector Intrinsic Programming
* Reference manuals. Subsequent compilers and PVECLIB
* implementations replicated this (operand order) error.
* This has now been reported as a bug against the compilers, which are
* in the process of applying fixes and distributing updates.
* This version of PVECLIB is updated to match the Vector Intrinsic
* Programming Reference. This implementation is independent of the
* compilers' update status.
*
* |processor|Latency|Throughput|
* |--------:|:-----:|:---------|
* |power8 | 6-7 | 2/cycle |
* |power9 | 2 | 2/cycle |
*
* @param vf64x vector double values containing the magnitudes.
* @param vf64y vector double values containing the sign bits.
* @return vector double values with magnitude from vf64x and the
* sign of vf64y.
* @param vf64x vector double values containing the sign bits.
* @param vf64y vector double values containing the magnitudes.
* @return vector double values with magnitude from vf64y and the
* sign of vf64x.
*/
static inline vf64_t
vec_copysignf64 (vf64_t vf64x , vf64_t vf64y)
vec_copysignf64 (vf64_t vf64x, vf64_t vf64y)
{
#if _ARCH_PWR7
/* P9 has a 2 cycle xvcpsgndp and eliminates a const load. */
return (vec_cpsgn (vf64x, vf64y));
#ifdef PVECLIB_CPSGN_FIXED
return (vec_cpsgn (vf64x, vf64y));
#else
vf64_t result;
__asm__(
"xvcpsgndp %x0,%x1,%x2;\n"
: "=wa" (result)
: "wa" (vf64x), "wa" (vf64y)
:);
return (result);
#endif
#else
const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
vf64_t result;
const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
vf64_t result;

result = (vf64_t)vec_sel ((vui32_t)vf64x, (vui32_t)vf64y, signmask);
return (result);
result = (vf64_t) vec_sel ((vui32_t) vf64y, (vui32_t) vf64x, signmask);
return (result);
#endif
}

Expand Down
Loading