From 8f14ea485974f9acc500dc5d5f71ac35badd72a6 Mon Sep 17 00:00:00 2001
From: Steven Munroe
Date: Tue, 25 Jan 2022 13:20:22 -0600
Subject: [PATCH] Fix vec_copysign implementations per issue #158, Part 1B.

Rebased after the add/subqpo merge.

It seems GCC (and Clang, which followed GCC) initially reversed the
operands for vec_cpsgn(), so the sign is copied from operand b into a.
This differs from the Intrinsic Reference and the ISA, which say the
sign is copied from operand a into b.

Unfortunately the PVECLIB implementations of vec_copysignf32(),
vec_copysignf64(), and vec_copysignf128() duplicated the results of the
original GCC vec_cpsgn(). This has been reported as a bug, and GCC and
Clang are now in the process of "fixing" this bug to match the
Intrinsic Reference guide. The fix will be applied to currently
supported compiler versions, but older versions will remain unchanged.

This change set will correct the PVECLIB copysign implementations to
match the Intrinsic Reference manual. We will use the macro
PVECLIB_CPSGN_FIXED to isolate the PVECLIB implementations from the
compiler changes.

* src/pveclib/vec_f128_ppc.h (vec_copysignf128): Change to match
operand order from Vector Intrinsic Reference.
(vec_xscvsqqp [_ARCH_PWR9]): Use correct vec_copysignf128 operand
order.
* src/pveclib/vec_f32_ppc.h (vec_copysignf32): Change to match
operand order from Vector Intrinsic Reference.
* src/pveclib/vec_f64_ppc.h (vec_copysignf64): Change to match
operand order from Vector Intrinsic Reference.
* src/testsuite/arith128_test_f128.c (test_copysignf128): Define new
__float128 const f128_nsnan. Reverse test operands and results to
match corrected vec_copysignf128() implementation. Remove duplicated
tests.
* src/testsuite/arith128_test_f32.c (test_float_cpsgn): Reverse test
operands and results to match corrected vec_copysignf32()
implementation.
* src/testsuite/arith128_test_f64.c (test_double_cpsgn): Reverse test
operands and results to match corrected vec_copysignf64()
implementation.
* src/testsuite/vec_f32_dummy.c (test_vec_copysignf32): New compile
test.
* src/testsuite/vec_f64_dummy.c (test_vec_copysignf64): New compile
test.

Signed-off-by: Steven Munroe
---
 src/pveclib/vec_f128_ppc.h         |  33 +++--
 src/pveclib/vec_f32_ppc.h          |  37 ++++--
 src/pveclib/vec_f64_ppc.h          |  46 +++++--
 src/testsuite/arith128_test_f128.c | 192 ++++++-----------------------
 src/testsuite/arith128_test_f32.c  |  14 +--
 src/testsuite/arith128_test_f64.c  |  26 ++--
 src/testsuite/vec_f32_dummy.c      |   6 +
 src/testsuite/vec_f64_dummy.c      |   6 +
 8 files changed, 154 insertions(+), 206 deletions(-)

diff --git a/src/pveclib/vec_f128_ppc.h b/src/pveclib/vec_f128_ppc.h
index 80f2c92..94ad206 100644
--- a/src/pveclib/vec_f128_ppc.h
+++ b/src/pveclib/vec_f128_ppc.h
@@ -441,7 +441,7 @@ vec_xsxexpqp (__binary128 f128)
  * \endcode
  *
  * \note Would like to use the intrinsic scalar_extract_exp() here but
- * this is not available until GCC 11.
+ * this is not available until GCC 11 (or later).
  * Also GCC defines these scalar built-ins to return integer scalar
  * values in GPRs.
  * This would defeat the purpose of an all vector implementation.
@@ -1769,7 +1769,7 @@ static inline vec_xscvsqqp (vi128_t int128)
   lo64 = int64[VEC_DW_L];
   result = (hi64 * two64) + lo64;
   // copy the __int128's sign into the __binary128 result
-  result = vec_copysignf128 (result, i_sign);
+  result = vec_copysignf128 (i_sign, result);
 #elif defined (_ARCH_PWR8)
 ...
 #endif
@@ -3806,18 +3806,29 @@ vec_all_iszerof128 (__binary128 f128)
 #endif
 }
 
-/** \brief Copy the sign bit from f128y and merge with the magnitude
- * from f128x. The merged result is returned as a __float128 value.
+/** \brief Copy the sign bit from f128x and merge with the magnitude
+ * from f128y. The merged result is returned as a __float128 value.
+ *
+ * \note This operation was patterned after the intrinsic vec_cpsgn
+ * (altivec.h) introduced for POWER7 and VSX. It turns out the
+ * original (GCC 4.9) compiler implementation reversed the operands
+ * and does not match the PowerISA or the Vector Intrinsic Programming
+ * Reference manuals. Subsequent compilers and PVECLIB
+ * implementations replicated this (operand order) error.
+ * This has now been reported as a bug against the compilers, which
+ * are in the process of applying fixes and distributing updates.
+ * This version of PVECLIB is updated to match the Vector Intrinsic
+ * Programming Reference.
  *
  * |processor|Latency|Throughput|
  * |--------:|:-----:|:---------|
  * |power8   | 2-11  | 2/cycle  |
  * |power9   |  2    | 4/cycle  |
  *
- * @param f128x a __float128 value containing the magnitude.
- * @param f128y a __float128 value containing the sign bit.
- * @return a __float128 value with magnitude from f128x and the
- * sign of f128y.
+ * @param f128x a __float128 value containing the sign bit.
+ * @param f128y a __float128 value containing the magnitude.
+ * @return a __float128 value with magnitude from f128y and the
+ * sign of f128x.
  */
 static inline __binary128
 vec_copysignf128 (__binary128 f128x, __binary128 f128y)
@@ -3825,7 +3836,7 @@ vec_copysignf128 (__binary128 f128x, __binary128 f128y)
   __binary128 result;
 #if _ARCH_PWR9
   __asm__(
-      "xscpsgnqp %0,%2,%1;\n"
+      "xscpsgnqp %0,%1,%2;\n"
       : "=v" (result)
       : "v" (f128x), "v" (f128y)
       :);
@@ -3835,7 +3846,7 @@ vec_copysignf128 (__binary128 f128x, __binary128 f128y)
   tmpx = vec_xfer_bin128_2_vui32t (f128x);
   tmpy = vec_xfer_bin128_2_vui32t (f128y);
 
-  tmp = vec_sel (tmpx, tmpy, signmask);
+  tmp = vec_sel (tmpy, tmpx, signmask);
   result = vec_xfer_vui32t_2_bin128 (tmp);
 #endif
   return (result);
@@ -8006,7 +8017,7 @@ static inline vec_xscvsqqp (vi128_t int128)
   lo64 = int64[VEC_DW_L];
   result = (hi64 * two64) + lo64;
   // Copy the __int128's sign into the __binary128 result
-  result = vec_copysignf128 (result, i_sign);
+  result = vec_copysignf128 (i_sign, result);
 #elif defined (_ARCH_PWR8)
   vui64_t q_exp;
   vui128_t q_sig;
diff --git a/src/pveclib/vec_f32_ppc.h b/src/pveclib/vec_f32_ppc.h
index a005b73..a75be02 100644
--- a/src/pveclib/vec_f32_ppc.h
+++ b/src/pveclib/vec_f32_ppc.h
@@ -788,31 +788,52 @@ vec_any_iszerof32 (vf32_t vf32)
 #endif
 }
 
-/** \brief Copy the sign bit from vf32y merged with magnitude from
- * vf32x and return the resulting vector float values.
+/** \brief Copy the sign bit from vf32x merged with magnitude from
+ * vf32y and return the resulting vector float values.
+ *
+ * \note This operation was patterned after the intrinsic vec_cpsgn
+ * (altivec.h) introduced for POWER7 and VSX. It turns out the
+ * original (GCC 4.9) compiler implementation reversed the operands
+ * and does not match the PowerISA or the Vector Intrinsic Programming
+ * Reference manuals. Subsequent compilers and PVECLIB
+ * implementations replicated this (operand order) error.
+ * This has now been reported as a bug against the compilers, which
+ * are in the process of applying fixes and distributing updates.
+ * This version of PVECLIB is updated to match the Vector Intrinsic
+ * Programming Reference. This implementation is independent of the
+ * compilers' update status.
  *
  * |processor|Latency|Throughput|
  * |--------:|:-----:|:---------|
  * |power8   | 6-7   | 2/cycle  |
  * |power9   | 2     | 2/cycle  |
  *
- * @param vf32x vector float values containing the magnitudes.
- * @param vf32y vector float values containing the sign bits.
- * @return vector float values with magnitude from vf32x and the
- * sign of vf32y.
+ * @param vf32x vector float values containing the sign bits.
+ * @param vf32y vector float values containing the magnitudes.
+ * @return vector float values with magnitude from vf32y and the
+ * sign of vf32x.
  */
 static inline vf32_t
 vec_copysignf32 (vf32_t vf32x, vf32_t vf32y)
 {
 #if _ARCH_PWR7
-  /* P9 has a 2 cycle xvcpsgnsp and eliminates a const load. */
+#ifdef PVECLIB_CPSGN_FIXED
   return (vec_cpsgn (vf32x, vf32y));
+#else
+  vf32_t result;
+  __asm__(
+      "xvcpsgnsp %x0,%x1,%x2;\n"
+      : "=wa" (result)
+      : "wa" (vf32x), "wa" (vf32y)
+      :);
+  return (result);
+#endif
 #else
   const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000,
                                            0x80000000, 0x80000000);
   vf32_t result;
 
-  result = (vf32_t)vec_sel ((vui32_t)vf32x, (vui32_t)vf32y, signmask);
+  result = (vf32_t)vec_sel ((vui32_t)vf32y, (vui32_t)vf32x, signmask);
   return (result);
 #endif
 }
diff --git a/src/pveclib/vec_f64_ppc.h b/src/pveclib/vec_f64_ppc.h
index 2ac8340..3a1766e 100644
--- a/src/pveclib/vec_f64_ppc.h
+++ b/src/pveclib/vec_f64_ppc.h
@@ -780,31 +780,53 @@ vec_any_iszerof64 (vf64_t vf64)
 #endif
 }
 
-/** \brief Copy the sign bit from vf64y merged with magnitude from
- * vf64x and return the resulting vector double values.
+/** \brief Copy the sign bit from vf64x merged with magnitude from
+ * vf64y and return the resulting vector double values.
+ *
+ * \note This operation was patterned after the intrinsic vec_cpsgn
+ * (altivec.h) introduced for POWER7 and VSX. It turns out the
+ * original (GCC 4.9) compiler implementation reversed the operands
+ * and does not match the PowerISA or the Vector Intrinsic Programming
+ * Reference manuals. Subsequent compilers and PVECLIB
+ * implementations replicated this (operand order) error.
+ * This has now been reported as a bug against the compilers, which
+ * are in the process of applying fixes and distributing updates.
+ * This version of PVECLIB is updated to match the Vector Intrinsic
+ * Programming Reference. This implementation is independent of the
+ * compilers' update status.
  *
  * |processor|Latency|Throughput|
  * |--------:|:-----:|:---------|
  * |power8   | 6-7   | 2/cycle  |
  * |power9   | 2     | 2/cycle  |
  *
- * @param vf64x vector double values containing the magnitudes.
- * @param vf64y vector double values containing the sign bits.
- * @return vector double values with magnitude from vf64x and the
- * sign of vf64y.
+ * @param vf64x vector double values containing the sign bits.
+ * @param vf64y vector double values containing the magnitudes.
+ * @return vector double values with magnitude from vf64y and the
+ * sign of vf64x.
  */
 static inline vf64_t
-vec_copysignf64 (vf64_t vf64x , vf64_t vf64y)
+vec_copysignf64 (vf64_t vf64x, vf64_t vf64y)
 {
 #if _ARCH_PWR7
   /* P9 has a 2 cycle xvcpsgndp and eliminates a const load. */
-  return (vec_cpsgn (vf64x, vf64y));
+#ifdef PVECLIB_CPSGN_FIXED
+  return (vec_cpsgn (vf64x, vf64y));
+#else
+  vf64_t result;
+  __asm__(
+      "xvcpsgndp %x0,%x1,%x2;\n"
+      : "=wa" (result)
+      : "wa" (vf64x), "wa" (vf64y)
+      :);
+  return (result);
+#endif
 #else
-  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
-  vf64_t result;
+  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
+  vf64_t result;
 
-  result = (vf64_t)vec_sel ((vui32_t)vf64x, (vui32_t)vf64y, signmask);
-  return (result);
+  result = (vf64_t) vec_sel ((vui32_t) vf64y, (vui32_t) vf64x, signmask);
+  return (result);
 #endif
 }
diff --git a/src/testsuite/arith128_test_f128.c b/src/testsuite/arith128_test_f128.c
index 2638c26..5f98fec 100644
--- a/src/testsuite/arith128_test_f128.c
+++ b/src/testsuite/arith128_test_f128.c
@@ -4251,6 +4251,8 @@ test_copysignf128 (void)
   const __binary128 f128_snan = vec_xfer_vui64t_2_bin128 (
       CONST_VINT128_DW(0x7fff400000000000, 0));
+  const __binary128 f128_nsnan = vec_xfer_vui64t_2_bin128 (
+      CONST_VINT128_DW(0xffff400000000000, 0));
 
   __binary128 x, y, t, e;
   long tests_count = 0;
@@ -4277,22 +4279,23 @@
   print_vfloat128x(" x= ", x);
 #endif
   t = vec_copysignf128 (x, y);
-  e = (__binary128) f128_zero;
+  e = (__binary128) f128_nzero;
   rc += check_f128 ("check vec_copysignf128", x, t, e);
 #endif
 #if 1
   tests_count++;
-  x = (__binary128) f128_one;
+  y = (__binary128) f128_one;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
   t = vec_copysignf128 (x, y);
-  e = f128_one;
+  e = f128_none;
   rc += check_f128 ("check vec_copysignf128", x, t, e);
 #endif
 #if 1
   tests_count++;
-  x = (__binary128) f128_none;
+  x = (__binary128) f128_zero;
+  y = (__binary128) f128_none;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
@@ -4302,17 +4305,19 @@
 #if 1
   tests_count++;
-  x = (__binary128) f128_max;
+  x = (__binary128) f128_nzero;
+  y = (__binary128) f128_max;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
   t = vec_copysignf128 (x, y);
-  e = f128_max;
+  e = f128_nmax;
   rc += check_f128 ("check vec_copysignf128", x, t, e);
 #endif
 #if 1
   tests_count++;
-  x = (__binary128) f128_nmax;
+  x = (__binary128) f128_zero;
+  y = (__binary128) f128_nmax;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
@@ -4322,17 +4327,19 @@
 #if 1
   tests_count++;
-  x = (__binary128) f128_min;
+  x = (__binary128) f128_nzero;
+  y = (__binary128) f128_min;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
   t = vec_copysignf128 (x, y);
-  e = f128_min;
+  e = f128_nmin;
   rc += check_f128 ("check vec_copysignf128", x, t, e);
 #endif
 #if 1
   tests_count++;
-  x = (__binary128) f128_nmin;
+  x = (__binary128) f128_zero;
+  y = (__binary128) f128_nmin;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
@@ -4342,17 +4349,19 @@
 #if 1
   tests_count++;
-  x = (__binary128) f128_sub;
+  x = (__binary128) f128_nzero;
+  y = (__binary128) f128_sub;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
   t = vec_copysignf128 (x, y);
-  e = f128_sub;
+  e = f128_nsub;
   rc += check_f128 ("check vec_copysignf128", x, t, e);
 #endif
 #if 1
   tests_count++;
-  x = (__binary128) f128_nsub;
+  x = (__binary128) f128_zero;
+  y = (__binary128) f128_nsub;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
@@ -4362,17 +4371,19 @@
 #if 1
   tests_count++;
-  x = (__binary128) f128_inf;
+  x = (__binary128) f128_nzero;
+  y = (__binary128) f128_inf;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
   t = vec_copysignf128 (x, y);
-  e = f128_inf;
+  e = f128_ninf;
   rc += check_f128 ("check vec_copysignf128", x, t, e);
 #endif
 #if 1
   tests_count++;
-  x = (__binary128) f128_ninf;
+  x = (__binary128) f128_zero;
+  y = (__binary128) f128_ninf;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
@@ -4382,175 +4393,46 @@
 #if 1
   tests_count++;
-  x = (__binary128) f128_nan;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_nan;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_nnan;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_nan;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_snan;
+  x = (__binary128) f128_nzero;
+  y = (__binary128) f128_nan;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
   t = vec_copysignf128 (x, y);
-  e = f128_snan;
+  e = f128_nnan;
   rc += check_f128 ("check vec_copysignf128", x, t, e);
 #endif
-
 #if 1
   tests_count++;
   x = (__binary128) f128_zero;
-  y = (__binary128) f128_nzero;
+  y = (__binary128) f128_nnan;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
-  print_vfloat128x(" y= ", y);
 #endif
   t = vec_copysignf128 (x, y);
-  e = (__binary128) f128_nzero;
+  e = f128_nan;
   rc += check_f128 ("check vec_copysignf128", x, t, e);
 #endif
 #if 1
   tests_count++;
   x = (__binary128) f128_nzero;
+  y = (__binary128) f128_snan;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
   t = vec_copysignf128 (x, y);
-  e = (__binary128) f128_nzero;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_one;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_none;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_none;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_none;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_max;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_nmax;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_nmax;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_nmax;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_min;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_nmin;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_nmin;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_nmin;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_sub;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_nsub;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_nsub;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_nsub;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_inf;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_ninf;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_ninf;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_ninf;
-  rc += check_f128 ("check vec_copysignf128", x, t, e);
-#endif
-#if 1
-  tests_count++;
-  x = (__binary128) f128_nan;
-#ifdef __DEBUG_PRINT__
-  print_vfloat128x(" x= ", x);
-#endif
-  t = vec_copysignf128 (x, y);
-  e = f128_nnan;
+  e = f128_nsnan;
   rc += check_f128 ("check vec_copysignf128", x, t, e);
 #endif
 #if 1
   tests_count++;
-  x = (__binary128) f128_nnan;
+  x = (__binary128) f128_zero;
+  y = (__binary128) f128_nsnan;
 #ifdef __DEBUG_PRINT__
   print_vfloat128x(" x= ", x);
 #endif
   t = vec_copysignf128 (x, y);
-  e = f128_nnan;
+  e = f128_snan;
   rc += check_f128 ("check vec_copysignf128", x, t, e);
 #endif
   /* accumulate the number of values tested, in case we are doing
diff --git a/src/testsuite/arith128_test_f32.c b/src/testsuite/arith128_test_f32.c
index 397387b..8eb9175 100644
--- a/src/testsuite/arith128_test_f32.c
+++ b/src/testsuite/arith128_test_f32.c
@@ -1194,7 +1194,7 @@ test_float_cpsgn (void)
 
   i = (vf32_t) { 0.0, -0.0, 0.0, -0.0 };
   j = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
-  e = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
+  e = (vf32_t) { 0.0, -0.0, 0.0, -0.0 };
   k = vec_copysignf32 (i, j);
 
 #ifdef __DEBUG_PRINT__
@@ -1204,9 +1204,9 @@
 #endif
   rc += check_v4f32x ("vec_copysignf32 1:", k, e);
 
-  i = (vf32_t) { __FLT_MAX__, __FLT_MIN__, __FLT_EPSILON__,
+  i = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
+  j = (vf32_t) { __FLT_MAX__, __FLT_MIN__, __FLT_EPSILON__,
                  __FLT_DENORM_MIN__ };
-  j = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
   e = (vf32_t) { -(__FLT_MAX__), __FLT_MIN__, -(__FLT_EPSILON__),
                  __FLT_DENORM_MIN__ };
   k = vec_copysignf32 (i, j);
@@ -1218,9 +1218,9 @@
 #endif
   rc += check_v4f32x ("vec_copysignf32 2:", k, e);
 
-  i = (vf32_t) CONST_VINT128_W(__FLOAT_INF, __FLOAT_NINF, __FLOAT_INF,
+  i = (vf32_t) CONST_VINT32_W(0.0, -0.0, 0.0, -0.0);
+  j = (vf32_t) CONST_VINT128_W(__FLOAT_INF, __FLOAT_NINF, __FLOAT_INF,
                                __FLOAT_NINF);
-  j = (vf32_t) CONST_VINT32_W(0.0, -0.0, 0.0, -0.0);
   e = (vf32_t) CONST_VINT128_W(__FLOAT_INF, __FLOAT_NINF, __FLOAT_INF,
                                __FLOAT_NINF);
   k = vec_copysignf32 (i, j);
@@ -1232,9 +1232,9 @@
 #endif
   rc += check_v4f32x ("vec_copysignf32 3:", k, e);
 
-  i = (vf32_t) CONST_VINT128_W(__FLOAT_NAN, __FLOAT_NNAN, __FLOAT_NSNAN,
+  i = (vf32_t) {-0.0, 0.0, 0.0, -0.0 };
+  j = (vf32_t) CONST_VINT128_W(__FLOAT_NAN, __FLOAT_NNAN, __FLOAT_NSNAN,
                                __FLOAT_SNAN);
-  j = (vf32_t) {-0.0, 0.0, 0.0, -0.0 };
   e = (vf32_t) CONST_VINT128_W(__FLOAT_NNAN, __FLOAT_NAN, __FLOAT_SNAN,
                                __FLOAT_NSNAN);
   k = vec_copysignf32 (i, j);
diff --git a/src/testsuite/arith128_test_f64.c b/src/testsuite/arith128_test_f64.c
index 03999a7..77acf56 100644
--- a/src/testsuite/arith128_test_f64.c
+++ b/src/testsuite/arith128_test_f64.c
@@ -1604,7 +1604,7 @@ test_double_cpsgn (void)
 
   i = (vf64_t) { 0.0, -0.0 };
   j = (vf64_t) { -0.0, 0.0 };
-  e = (vf64_t) { -0.0, 0.0 };
+  e = (vf64_t) { 0.0, -0.0 };
   k = vec_copysignf64 (i, j);
 
 #ifdef __DEBUG_PRINT__
@@ -1614,8 +1614,8 @@
 #endif
   rc += check_v2f64x ("vec_copysignf64 1:", k, e);
 
-  i = (vf64_t) { __DBL_MAX__, __DBL_MIN__ };
-  j = (vf64_t) { -0.0, 0.0 };
+  i = (vf64_t) { -0.0, 0.0 };
+  j = (vf64_t) { __DBL_MAX__, __DBL_MIN__ };
   e = (vf64_t) { -(__DBL_MAX__), __DBL_MIN__ };
   k = vec_copysignf64 (i, j);
 
@@ -1626,8 +1626,8 @@
 #endif
   rc += check_v2f64x ("vec_copysignf64 2:", k, e);
 
-  i = (vf64_t) { __DBL_EPSILON__, __DBL_DENORM_MIN__ };
-  j = (vf64_t) { -0.0, 0.0 };
+  i = (vf64_t) { -0.0, 0.0 };
+  j = (vf64_t) { __DBL_EPSILON__, __DBL_DENORM_MIN__ };
   e = (vf64_t) { -(__DBL_EPSILON__), __DBL_DENORM_MIN__ };
   k = vec_copysignf64 (i, j);
 
@@ -1638,8 +1638,8 @@
 #endif
   rc += check_v2f64x ("vec_copysignf64 3:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
-  j = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  i = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   k = vec_copysignf64 (i, j);
 
@@ -1650,8 +1650,8 @@
 #endif
   rc += check_v2f64x ("vec_copysignf64 4:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
-  j = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  i = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   k = vec_copysignf64 (i, j);
 
@@ -1662,8 +1662,8 @@
 #endif
   rc += check_v2f64x ("vec_copysignf64 5:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_NAN, __DOUBLE_NNAN);
-  j = (vf64_t) CONST_VINT64_DW( -0.0, 0.0 );
+  i = (vf64_t) CONST_VINT64_DW( -0.0, 0.0 );
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_NAN, __DOUBLE_NNAN);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_NNAN, __DOUBLE_NAN);
   k = vec_copysignf64 (i, j);
 
@@ -1674,8 +1674,8 @@
 #endif
   rc += check_v2f64x ("vec_copysignf64 6:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_NSNAN, __DOUBLE_SNAN);
-  j = (vf64_t) CONST_VINT64_DW ( 0.0, -0.0 );
+  i = (vf64_t) CONST_VINT64_DW ( 0.0, -0.0 );
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_NSNAN, __DOUBLE_SNAN);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_SNAN, __DOUBLE_NSNAN);
   k = vec_copysignf64 (i, j);
diff --git a/src/testsuite/vec_f32_dummy.c b/src/testsuite/vec_f32_dummy.c
index 0af43ca..5ae56d1 100644
--- a/src/testsuite/vec_f32_dummy.c
+++ b/src/testsuite/vec_f32_dummy.c
@@ -34,6 +34,12 @@
 #include
 #include
 
+vf32_t
+test_vec_copysignf32 (vf32_t x, vf32_t y)
+{
+  return vec_copysignf32 (x, y);
+}
+
 vf32_t
 test_vec_xviexpsp (vui32_t sig, vui32_t exp)
 {
diff --git a/src/testsuite/vec_f64_dummy.c b/src/testsuite/vec_f64_dummy.c
index aca8bb0..494b6ec 100644
--- a/src/testsuite/vec_f64_dummy.c
+++ b/src/testsuite/vec_f64_dummy.c
@@ -35,6 +35,12 @@
 #include
 #include
 
+vf64_t
+test_vec_copysignf64 (vf64_t x, vf64_t y)
+{
+  return vec_copysignf64 (x, y);
+}
+
 vf64_t
 test_vec_xviexpdp (vui64_t sig, vui64_t exp)
 {
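
Note: with the corrected operand order, vec_copysignf64 (x, y) merges
the sign of x with the magnitude of y. A minimal usage sketch follows;
the function name example_copysignf64 and its test values are
illustrative only (not part of this patch), and it assumes the
installed header path <pveclib/vec_f64_ppc.h>:

#include <pveclib/vec_f64_ppc.h>

vf64_t
example_copysignf64 (void)
{
  /* x supplies the sign bits, y supplies the magnitudes
     (illustrative values, not taken from the test suite).  */
  vf64_t x = (vf64_t) { -0.0, 0.0 };
  vf64_t y = (vf64_t) {  2.0, 3.0 };
  /* Expect { -2.0, 3.0 }: magnitudes from y with the signs of x,
     matching the Vector Intrinsic Programming Reference.  */
  return (vec_copysignf64 (x, y));
}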