diff --git a/mex/cpfloat.c b/mex/cpfloat.c index efc831e..15a717c 100644 --- a/mex/cpfloat.c +++ b/mex/cpfloat.c @@ -41,11 +41,15 @@ void mexFunction(int nlhs, fpopts->precision = 11; fpopts->emin = -14; fpopts->emax = 15; - fpopts->subnormal = CPFLOAT_SUBN_USE; fpopts->explim = CPFLOAT_EXPRANGE_TARG; + fpopts->infinity = CPFLOAT_INF_USE; fpopts->round = CPFLOAT_RND_NE; - fpopts->flip = CPFLOAT_NO_SOFTERR; + fpopts->saturation = CPFLOAT_SAT_NO; + fpopts->subnormal = CPFLOAT_SUBN_USE; + + fpopts->flip = CPFLOAT_SOFTERR_NO; fpopts->p = 0.5; + fpopts->bitseed = NULL; fpopts->randseedf = NULL; fpopts->randseed = NULL; @@ -54,6 +58,7 @@ void mexFunction(int nlhs, /* Parse second argument and populate fpopts structure. */ if (nrhs > 1) { bool is_subn_rnd_default = false; + bool is_inf_no_default = false; if(!mxIsEmpty(prhs[1]) && !mxIsStruct(prhs[1])) { mexErrMsgIdAndTxt("cpfloat:invalidstruct", "Second argument must be a struct."); @@ -62,7 +67,7 @@ void mexFunction(int nlhs, if (tmp != NULL) { if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) - /* Use default format, for compatibility with chop. */ + /* Set default format, for compatibility with chop. */ strcpy(fpopts->format, "h"); else if (mxGetClassID(tmp) == mxCHAR_CLASS) strcpy(fpopts->format, mxArrayToString(tmp)); @@ -80,6 +85,7 @@ void mexFunction(int nlhs, fpopts->precision = 4; fpopts->emin = -6; fpopts->emax = 8; + is_inf_no_default = true; } else if (!strcmp(fpopts->format, "q52") || !strcmp(fpopts->format, "fp8-e5m2") || !strcmp(fpopts->format, "E5M2")) { @@ -134,6 +140,7 @@ void mexFunction(int nlhs, mexErrMsgIdAndTxt("cpfloat:invalidformat", "Invalid floating-point format specified."); } + /* Set default values to be compatible with MATLAB chop. */ tmp = mxGetField(prhs[1], 0, "subnormal"); if (tmp != NULL) { @@ -144,9 +151,8 @@ void mexFunction(int nlhs, } else { if (is_subn_rnd_default) fpopts->subnormal = CPFLOAT_SUBN_RND; /* Default for bfloat16. */ - else - fpopts->subnormal = CPFLOAT_SUBN_USE; } + tmp = mxGetField(prhs[1], 0, "explim"); if (tmp != NULL) { if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) @@ -154,6 +160,18 @@ void mexFunction(int nlhs, else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) fpopts->explim = *((double *)mxGetData(tmp)); } + + tmp = mxGetField(prhs[1], 0, "infinity"); + if (tmp != NULL) { + if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) + fpopts->infinity = CPFLOAT_INF_USE; + else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) + fpopts->infinity = *((double *)mxGetData(tmp)); + } else { + if (is_inf_no_default) + fpopts->infinity = CPFLOAT_INF_NO; /* Default for E4M5. */ + } + tmp = mxGetField(prhs[1], 0, "round"); if (tmp != NULL) { if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) @@ -161,10 +179,32 @@ void mexFunction(int nlhs, else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) fpopts->round = *((double *)mxGetData(tmp)); } + + tmp = mxGetField(prhs[1], 0, "saturation"); + if (tmp != NULL) { + if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) + fpopts->saturation = CPFLOAT_SAT_NO; + else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) + fpopts->saturation = *((double *)mxGetData(tmp)); + } + + tmp = mxGetField(prhs[1], 0, "subnormal"); + if (tmp != NULL) { + if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) + fpopts->subnormal = CPFLOAT_SUBN_USE; + else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) + fpopts->subnormal = *((double *)mxGetData(tmp)); + } else { + if (is_subn_rnd_default) + fpopts->subnormal = CPFLOAT_SUBN_RND; /* Default for bfloat16. */ + else + fpopts->subnormal = CPFLOAT_SUBN_USE; + } + tmp = mxGetField(prhs[1], 0, "flip"); if (tmp != NULL) { if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) - fpopts->flip = CPFLOAT_NO_SOFTERR; + fpopts->flip = CPFLOAT_SOFTERR_NO; else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) fpopts->flip = *((double *)mxGetData(tmp)); } @@ -288,10 +328,11 @@ void mexFunction(int nlhs, /* Allocate and return second output. */ if (nlhs > 1) { - const char* field_names[] = {"format", "params", "subnormal", "round", - "flip", "p", "explim"}; + const char* field_names[] = {"format", "params", "explim", "infinity", + "round", "saturation", "subnormal", + "flip", "p"}; mwSize dims[2] = {1, 1}; - plhs[1] = mxCreateStructArray(2, dims, 7, field_names); + plhs[1] = mxCreateStructArray(2, dims, 9, field_names); mxSetFieldByNumber(plhs[1], 0, 0, mxCreateString(fpopts->format)); mxArray *outparams = mxCreateDoubleMatrix(1,3,mxREAL); @@ -301,30 +342,40 @@ void mexFunction(int nlhs, outparamsptr[2] = fpopts->emax; mxSetFieldByNumber(plhs[1], 0, 1, outparams); - mxArray *outsubnormal = mxCreateDoubleMatrix(1,1,mxREAL); - double *outsubnormalptr = mxGetData(outsubnormal); - outsubnormalptr[0] = fpopts->subnormal; - mxSetFieldByNumber(plhs[1], 0, 2, outsubnormal); + mxArray *outexplim = mxCreateDoubleMatrix(1, 1, mxREAL); + double *outexplimptr = mxGetData(outexplim); + outexplimptr[0] = fpopts->explim; + mxSetFieldByNumber(plhs[1], 0, 2, outexplim); + + mxArray *outinfinity = mxCreateDoubleMatrix(1, 1, mxREAL); + double *outinfinityptr = mxGetData(outinfinity); + outinfinityptr[0] = fpopts->infinity; + mxSetFieldByNumber(plhs[1], 0, 3, outinfinity); mxArray *outround = mxCreateDoubleMatrix(1,1,mxREAL); double *outroundptr = mxGetData(outround); outroundptr[0] = fpopts->round; - mxSetFieldByNumber(plhs[1], 0, 3, outround); + mxSetFieldByNumber(plhs[1], 0, 4, outround); + + mxArray *outsaturation = mxCreateDoubleMatrix(1,1,mxREAL); + double *outsaturationptr = mxGetData(outsaturation); + outsaturationptr[0] = fpopts->saturation; + mxSetFieldByNumber(plhs[1], 0, 5, outsaturation); + + mxArray *outsubnormal = mxCreateDoubleMatrix(1,1,mxREAL); + double *outsubnormalptr = mxGetData(outsubnormal); + outsubnormalptr[0] = fpopts->subnormal; + mxSetFieldByNumber(plhs[1], 0, 6, outsubnormal); mxArray *outflip = mxCreateDoubleMatrix(1,1,mxREAL); double *outflipptr = mxGetData(outflip); outflipptr[0] = fpopts->flip; - mxSetFieldByNumber(plhs[1], 0, 4, outflip); + mxSetFieldByNumber(plhs[1], 0, 7, outflip); mxArray *outp = mxCreateDoubleMatrix(1,1,mxREAL); double *outpptr = mxGetData(outp); outpptr[0] = fpopts->p; - mxSetFieldByNumber(plhs[1], 0, 5, outp); - - mxArray *outexplim = mxCreateDoubleMatrix(1,1,mxREAL); - double *outexplimptr = mxGetData(outexplim); - outexplimptr[0] = fpopts->explim; - mxSetFieldByNumber(plhs[1], 0, 6, outexplim); + mxSetFieldByNumber(plhs[1], 0, 8, outp); } if (nlhs > 2) diff --git a/mex/cpfloat.m b/mex/cpfloat.m index 4f1dfa3..3e27dea 100644 --- a/mex/cpfloat.m +++ b/mex/cpfloat.m @@ -39,17 +39,17 @@ % the target format, respectively. The default value of this field is % the vector [11,-14,15]. % -% * The scalar FPOPTS.subnormal specifies the support for subnormal numbers. -% The target floating-point format will not support subnormal numbers if -% this field is set to 0, and will support them otherwise. The default value -% for this field is 0 if the target format is 'bfloat16' and 1 otherwise. -% % * The scalar FPOPTS.explim specifies the support for an extended exponent % range. The target floating-point format will have the exponent range of % the storage format ('single' or 'double', depending on the class of X) if % this field is set to 0, and the exponent range of the format specified in % FPOPTS.format otherwise. The default value for this field is 1. % +% * The scalar FPOPTS.infinity specifies whether infinities are supported. The +% target floating-point format will support infinities if this field is set +% to 1, and they will be replaced by NaNs otherwise. The default value for +% this field is 0 if the target format is 'E4M3' and 1 otherwise. +% % * The scalar FPOPTS.round specifies the rounding mode. Possible values are: % -1 for round-to-nearest with ties-to-away; % 0 for round-to-nearest with ties-to-zero; @@ -63,6 +63,16 @@ % Any other value results in no rounding. The default value for this field % is 1. % +% * The scalar FPOPTS.saturation specifies whether saturation arithmetic is in +% use. On overflow, the target floating-point format will use the largest +% representable floating-point if this field is set to 0, and infinity +% otherwise. The default value for this field is 0. + +% * The scalar FPOPTS.subnormal specifies the support for subnormal numbers. +% The target floating-point format will not support subnormal numbers if +% this field is set to 0, and will support them otherwise. The default value +% for this field is 0 if the target format is 'bfloat16' and 1 otherwise. +% % * The scalar FPOPTS.flip specifies whether the function should simulate the % occurrence of a single bit flip striking the floating-point representation % of elements of Y. Possible values are: diff --git a/src/cpfloat_binary32.h b/src/cpfloat_binary32.h index 3ecbe83..9ce1843 100644 --- a/src/cpfloat_binary32.h +++ b/src/cpfloat_binary32.h @@ -289,8 +289,8 @@ static inline int cpf_fmaf(float *X, const float *A, const float *B, #define INTSUFFIX U #define DEFPREC 24 -#define DEFEMAX 127 #define DEFEMIN -126 +#define DEFEMAX 127 #define NLEADBITS 9 #define NBITS 32 #define FULLMASK 0xFFFFFFFFU diff --git a/src/cpfloat_binary64.h b/src/cpfloat_binary64.h index 5df0a6b..fa57f63 100644 --- a/src/cpfloat_binary64.h +++ b/src/cpfloat_binary64.h @@ -289,8 +289,8 @@ static inline int cpf_fma(double *X, const double *A, const double *B, #define INTTYPE uint64_t #define INTSUFFIX ULL #define DEFPREC 53 -#define DEFEMAX 1023 #define DEFEMIN -1022 +#define DEFEMAX 1023 #define NLEADBITS 12 #define NBITS 64 #define FULLMASK 0xFFFFFFFFFFFFFFFFULL diff --git a/src/cpfloat_definitions.h b/src/cpfloat_definitions.h index 78bfc7b..9493f4d 100644 --- a/src/cpfloat_definitions.h +++ b/src/cpfloat_definitions.h @@ -8,10 +8,12 @@ * @details This file includes all the external header files used by CPFloat, * defines the enumerated types * - * + @ref cpfloat_subnormal_t, * + @ref cpfloat_explim_t, + * + @ref cpfloat_infinity_t, * + @ref cpfloat_rounding_t, + * + @ref cpfloat_saturation_t, * + @ref cpfloat_softerr_t, + * + @ref cpfloat_subnormal_t, * * and the structured data type @ref optstruct. It is not necessary to include * this file in order to use CPFloat, as it is already included by @ref @@ -52,16 +54,6 @@ typedef unsigned int cpfloat_precision_t; */ typedef int cpfloat_exponent_t; -/** - * @brief Subnormal support modes available in CPFloat. - */ -typedef enum { - /** Round subnormal numbers using current rounding mode. */ - CPFLOAT_SUBN_RND = 0, - /** Support storage of subnormal numbers. */ - CPFLOAT_SUBN_USE = 1 -} cpfloat_subnormal_t; - /** * @brief Extended exponent range modes available in CPFloat. */ @@ -72,6 +64,16 @@ typedef enum { CPFLOAT_EXPRANGE_TARG = 1 } cpfloat_explim_t; +/** + * @brief Infinity support modes available in CPFloat. + */ +typedef enum { + /** Use infinities in target format. */ + CPFLOAT_INF_NO = 0, + /** Replace infinities with NaNs in target format. */ + CPFLOAT_INF_USE = 1, +} cpfloat_infinity_t; + /** * @brief Rounding modes available in CPFloat. */ @@ -98,19 +100,38 @@ typedef enum { CPFLOAT_NO_RND = 8, } cpfloat_rounding_t; +/** + * @brief Saturation modes available in CPFloat. + */ +typedef enum { + /** Use standard arithmetic. */ + CPFLOAT_SAT_NO = 0, + /** Use saturation arithmetic. */ + CPFLOAT_SAT_USE = 1, +} cpfloat_saturation_t; + /** * @brief Soft fault simulation modes available in CPFloat. */ typedef enum { /** Do not introduce soft errors. */ - CPFLOAT_NO_SOFTERR = 0, + CPFLOAT_SOFTERR_NO = 0, /** Soft errors in fraction of target-format floating-point representation.*/ - CPFLOAT_FRAC_SOFTERR = 1, + CPFLOAT_SOFTERR_FRAC = 1, /** Soft errors anywhere in target-format floating-point representation. */ - CPFLOAT_FP_SOFTERR = 2 - + CPFLOAT_SOFTERR_FP = 2 } cpfloat_softerr_t; +/** + * @brief Subnormal support modes available in CPFloat. + */ +typedef enum { + /** Round subnormal numbers according to the current rounding mode. */ + CPFLOAT_SUBN_RND = 0, + /** Support storage of subnormal numbers. */ + CPFLOAT_SUBN_USE = 1 +} cpfloat_subnormal_t; + /** @cond */ #ifdef PCG_VARIANTS_H_INCLUDED #define CPFLOAT_BITSEEDTYPE pcg32_random_t @@ -157,12 +178,12 @@ typedef struct { * Possible values are: * + `q43`, `e4m3`, `E4M3` for E4M3 (4-bit exponent, 4-bit significand); * + `q52`, `e5m2`, `E5M2` for E5M2 (5-bit exponent, 2-bit significand); - * + `b`, `bf16`, and `bfloat16` for bfloat16; - * + `h`, `fp16`, `binary16`, and `half` for binary16; - * + `t`, `tf32`, and `TensorFloat-32`, for TensorFloat-32; - * + `s`, `fp32`, `binary32`, and `single` for binary32; - * + `d`, `fp64`, `binary64`, and `double` for binary64; and - * + `custom`, and `c` for a format specified using `precision` and `emax`. + * + `b`, `bf16`, `bfloat16` for bfloat16; + * + `h`, `fp16`, `binary16`, `half` for binary16; + * + `t`, `tf32`, `TensorFloat-32`, for TensorFloat-32; + * + `s`, `fp32`, `binary32`, `single` for binary32; + * + `d`, `fp64`, `binary64`, `double` for binary64; and + * + `custom`, `c` for a format specifying `precision`, `emin`, and `emax`. * * The validation functions cpfloatf_validate_optstruct() and * cpfloat_validate_optstruct() return a warning code if this field is not set @@ -179,8 +200,8 @@ typedef struct { * number of digits of precision for `float` and `double` cannot exceed 11 and * 25, respectively, when using stochastic rounding, and cannot exceed 23 and * 52, respectively, for other rounding modes. The C implementation does not - * have any such restrictions, but note that using larger values can cause - * double rounding. + * have any such restrictions, but using larger values can cause double + * rounding. * * The validation functions cpfloatf_validate_optstruct() and * cpfloat_validate_optstruct() return an error code if the required number of @@ -189,6 +210,19 @@ typedef struct { * by the MEX interface. */ cpfloat_precision_t precision; + /** + * @brief Minimum exponent of target format. + * + * @details The minimum values allowed are -126 and -1022 if the storage + * format is `float` or `double`, respectively. If a smaller value is chosen, + * it is changed to the minimum allowed value without warning. This field is + * ignored unless `explim` is set to `CPFLOAT_EXPRANGE_TARG`. + * + * The validation functions cpfloatf_validate_optstruct() and + * cpfloat_validate_optstruct() return an error code if the required minimum + * exponent is smaller than the minimum allowed by the storage format. + */ + cpfloat_exponent_t emin; /** * @brief Maximum exponent of target format. * @@ -202,27 +236,6 @@ typedef struct { * exponent is larger than the maximum allowed by the storage format. */ cpfloat_exponent_t emax; - /** - * @brief Minimum exponent of target format. - * - * @details The minimum values allowed are -126 and -1022 if the storage format - * is `float` or `double`, respectively. If a smaller value is chosen, it is - * changed to the minimum allowed value without warning. This field is ignored - * unless `explim` is set to `CPFLOAT_EXPRANGE_TARG`. - * - * The validation functions cpfloatf_validate_optstruct() and - * cpfloat_validate_optstruct() return an error code if the required minimum - * exponent is smaller than the minimum allowed by the storage format. - */ - cpfloat_exponent_t emin; - /** - * @brief Support for subnormal numbers in target format. - * - * @details Subnormal numbers are supported if this field is set to - * `CPFLOAT_SUBN_USE` and rounded to a normal number using the current - * rounding mode if it is set to `CPFLOAT_SUBN_RND`. - */ - cpfloat_subnormal_t subnormal; /** * @brief Support for extended exponents in target format. * @@ -232,6 +245,14 @@ typedef struct { * `CPFLOAT_EXPRANGE_STOR`. */ cpfloat_explim_t explim; + /** + * @brief Support for infinities in target format. + * + * @details If this field is set to `CPFLOAT_INF_USE`, the target format + * supports signed infinities. If the field is set to `CPFLOAT_INF_NO`, + * infinities are replaced with a quiet NaN. + */ + cpfloat_infinity_t infinity; /** * @brief Rounding mode to be used for the conversion. * @@ -257,15 +278,36 @@ typedef struct { * those in the list above is specified. */ cpfloat_rounding_t round; + /** + * @brief Support for saturation arithmetic in target format. + * + * @details If this field is set to `CPFLOAT_SAT_USE`, numbers too large to be + * represented in the target format are clamped to the largest floating-point + * number of appropriate sign. If this field is set to `CPFLOAT_SAT_NO`, + * numbers that are too large to be represented are rounded to either the + * largest normal value of appropriate sign or the closest infinity according + * to the current rounding mode. + */ + cpfloat_saturation_t saturation; + /** + * @brief Support for subnormal numbers in target format. + * + * @details Subnormal numbers are supported if this field is set to + * `CPFLOAT_SUBN_USE` and rounded to a normal number according to the current + * rounding mode if it is set to `CPFLOAT_SUBN_RND`. + */ + cpfloat_subnormal_t subnormal; + + /* Bit flips. */ /** * @brief Support for soft errors. * - * @details If this field is not set to `CPFLOAT_NO_SOFTERR`, a single bit + * @details If this field is not set to `CPFLOAT_SOFTERR_NO`, a single bit * flip is introduced in the binary floating-point representation of the * rounded result with probability `p`. The bit flip can strike only the * target-format fraction (significand without the implicit bit) if this field - * is set to `CPFLOAT_FRAC_SOFTERR` and any bit in the target-format - * representation if it is set to `CPFLOAT_FP_SOFTERR`. + * is set to `CPFLOAT_SOFTERR_FRAC` and any bit in the target-format + * representation if it is set to `CPFLOAT_SOFTERR_FP`. */ cpfloat_softerr_t flip; /** @@ -274,14 +316,16 @@ typedef struct { * @details The probability of flipping a single bit in the binary * floating-point representation or in the fraction (significand without the * implicit bit) of a number after rounding. This field is ignored if `flip` - * is set to `CPFLOAT_NO_SOFTERR`. + * is set to `CPFLOAT_SOFTERR_NO`. * * The validation functions cpfloatf_validate_optstruct() and * cpfloat_validate_optstruct() return an error code if `flip` is set to - * `CPFLOAT_FP_SOFTERR` or `CPFLOAT_FRAC_SOFTERR` and this field does not + * `CPFLOAT_FP_SOFTERR` or `CPFLOAT_SOFTERR_FRAC` and this field does not * contain a number in the interval [0,1]. */ double p; + + /* Internal: state of pseudo-random number generator. */ /** * @brief Internal state of pseudo-random number generator for single bits. * diff --git a/src/cpfloat_template.h b/src/cpfloat_template.h index e0d80b4..7695e87 100644 --- a/src/cpfloat_template.h +++ b/src/cpfloat_template.h @@ -158,11 +158,14 @@ typedef union { #define FPPARAMS ADDSUFFIXTO(fpparams) typedef struct { cpfloat_precision_t precision; - cpfloat_exponent_t emax; cpfloat_exponent_t emin; - cpfloat_subnormal_t subnormal; + cpfloat_exponent_t emax; + cpfloat_infinity_t infinity; cpfloat_rounding_t round; + cpfloat_saturation_t saturation; + cpfloat_subnormal_t subnormal; FPTYPE ftzthreshold; + FPTYPE ofvalue; FPTYPE xmin; FPTYPE xmax; FPTYPE xbnd; @@ -277,7 +280,7 @@ static inline int VALIDATE_INPUT(const optstruct *fpopts) { retval = -4; /* Return 5 if p is required but is not a valid probability. */ - if (fpopts->flip != CPFLOAT_NO_SOFTERR && (fpopts->p > 1 || fpopts->p < 0)) + if (fpopts->flip != CPFLOAT_SOFTERR_NO && (fpopts->p > 1 || fpopts->p < 0)) return 5; /* Return 0 or warning value. */ @@ -292,40 +295,49 @@ static inline FPPARAMS COMPUTE_GLOBAL_PARAMS(const optstruct *fpopts, /* Actual precision and exponent range. */ *retval = 0; cpfloat_precision_t precision = fpopts->precision; - cpfloat_exponent_t emax = fpopts->explim == CPFLOAT_EXPRANGE_TARG ? - fpopts->emax : - DEFEMAX; cpfloat_exponent_t emin = fpopts->explim == CPFLOAT_EXPRANGE_TARG ? fpopts->emin : DEFEMIN; + cpfloat_exponent_t emax = fpopts->explim == CPFLOAT_EXPRANGE_TARG ? + fpopts->emax : + DEFEMAX; if (precision > DEFPREC) { precision = DEFPREC; *retval = 1; } - if (emax > DEFEMAX) { - emax = DEFEMAX; + if (emin < DEFEMIN) { + emin = DEFEMIN; *retval = 2; } - if (emin < DEFEMIN) { - emin = DEFEMIN; + if (emax > DEFEMAX) { + emax = DEFEMAX; *retval = 2; } FPTYPE xmin = ldexp(1., emin); /* Smallest pos. normal. */ FPTYPE xmins = ldexp(1., emin-precision+1); /* Smallest pos. subnormal. */ FPTYPE ftzthreshold = (fpopts->subnormal == CPFLOAT_SUBN_USE) ? xmins : xmin; + FPTYPE xmax = ldexp(1., emax) * (2-ldexp(1., 1-precision)); FPTYPE xbnd = ldexp(1., emax) * (2-ldexp(1., -precision)); + /* + * Here, fpopts->saturation takes precedence over fpopts->infinity. Therefore, + * when saturation arithmetic is used, infinities are not produced even when + * the target format supports them. + */ + FPTYPE ofvalue = (fpopts->saturation == CPFLOAT_SAT_USE) ? xmax : + (fpopts->infinity == CPFLOAT_INF_USE ? INFINITY : NAN); /* Bitmasks. */ INTTYPE leadmask = FULLMASK << (DEFPREC-precision); /* To keep. */ INTTYPE trailmask = leadmask ^ FULLMASK; /* To discard. */ - FPPARAMS params = {precision, emax, emin, fpopts->subnormal, fpopts->round, - ftzthreshold, xmin, xmax, xbnd, + FPPARAMS params = {precision, emin, emax, fpopts->infinity, fpopts->round, + fpopts->saturation, fpopts->subnormal, + ftzthreshold, ofvalue, xmin, xmax, xbnd, leadmask, trailmask, NULL, NULL}; return params; @@ -401,7 +413,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, else \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->ftzthreshold)); \ } else if (ABS(y) >= p->xbnd) { /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ *(x) = FPOF((INTOF(y) + \ @@ -425,7 +437,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, else \ *(x) = FPOF(SIGN(y)); \ } else if (ABS(y) > p->xbnd) { /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ *(x) = FPOF((INTOF(y) + (lp->trailmask>>1)) & lp->leadmask); \ @@ -450,7 +462,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, else \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->ftzthreshold)); \ } else if (ABS(y) >= p->xbnd) { /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ INTTYPE LSB = ((INTOF(y) >> (DEFPREC-lp->precision)) & INTCONST(1)) \ @@ -477,11 +489,11 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, *(x) = *(y) > 0 ? p->ftzthreshold : 0; \ } else if (ABS(y) > p->xmax) { /* Overflow */ \ if (*(y) > p->xmax) \ - *(x) = INFINITY; \ - else if (*(y) < -p->xmax && *(y) != -INFINITY) \ + *(x) = p->ofvalue; \ + else if (*(y) < -p->xmax && *(y) != -p->ofvalue) \ *(x) = -p->xmax; \ else /* *(y) == -INFINITY */ \ - *(x) = -INFINITY; \ + *(x) = -p->ofvalue; \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ if (SIGN(y) == 0) /* Add ulp if x is positive. */ \ @@ -509,11 +521,11 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, *(x) = *(y) >= 0 ? 0 : -p->ftzthreshold; \ } else if (ABS(y) > p->xmax) { /* Overflow */ \ if (*(y) < -p->xmax) \ - *(x) = -INFINITY; \ - else if (*(y) > p->xmax && *(y) != INFINITY) \ + *(x) = -p->ofvalue; \ + else if (*(y) > p->xmax && *(y) != p->ofvalue) \ *(x) = p->xmax; \ else /* *(y) == INFINITY */ \ - *(x) = INFINITY; \ + *(x) = p->ofvalue; \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ if (SIGN(y)) /* Subtract ulp if x is positive. */ \ @@ -532,7 +544,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, #define RD_TWD_ZERO_SCALAR_OTHER_EXP(x, y, p, lp) \ if (ABS(y) < p->ftzthreshold) { /* Underflow */ \ *(x) = FPOF(SIGN(y)); \ - } else if (ABS(y) > p->xmax && ABS(y) != INFINITY) { /* Overflow */ \ + } else if (ABS(y) > p->xmax && ABS(y) != p->ofvalue) { /* Overflow */ \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->xmax)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ @@ -581,7 +593,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, *(x) = *(y); \ } \ if (ABS(x) >= p->xbnd) /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); /* Stochastic rounding with equal probabilities. */ #define RS_EQUI_SCALAR(x, y, p, lp) \ @@ -589,9 +601,9 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, if (ABS(y) < p->ftzthreshold && *(y) != 0) { /* Underflow */ \ randombit = GENBIT(p->BITSEED); \ *(x) = FPOF(SIGN(y) | INTOFCONST(randombit ? p->ftzthreshold : 0)); \ - } else if (ABS(y) > p->xmax && ABS(y) != INFINITY) { /* Overflow */ \ + } else if (ABS(y) > p->xmax && ABS(y) != p->ofvalue) { /* Overflow */ \ randombit = GENBIT(p->BITSEED); \ - *(x) = FPOF(SIGN(y) | INTOFCONST(randombit ? INFINITY : p->xmax)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(randombit ? p->ofvalue : p->xmax)); \ } else if ((INTOF(y) & lp->trailmask)) { /* Not exactly representable. */ \ randombit = GENBIT(p->BITSEED); \ *(x) = FPOF(INTOF(y) & lp->leadmask); \ @@ -604,7 +616,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, #define RO_SCALAR(x, y, p, lp) \ if (ABS(y) < p->ftzthreshold && *(y) != 0) { /* Underflow */ \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->ftzthreshold)); \ - } else if (ABS(y) > p->xmax && ABS(y) != INFINITY) { /* Overflow */ \ + } else if (ABS(y) > p->xmax && ABS(y) != p->ofvalue) { /* Overflow */ \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->xmax)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ @@ -651,7 +663,9 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, numelem, p, lp) \ PARALLEL_STRING(PARALLEL) \ { \ - if (p->emax == DEFEMAX) { \ + if (p->emax == DEFEMAX \ + && p->saturation == CPFLOAT_SAT_NO \ + && p->infinity == CPFLOAT_INF_USE) { \ FOR_STRING(PARALLEL) \ for (size_t i=0; iflip != CPFLOAT_NO_SOFTERR) { \ + if (fpopts->flip != CPFLOAT_SOFTERR_NO) { \ PRNG_RAND_INIT \ size_t flippos; \ - const size_t totbits = fpopts->flip == CPFLOAT_FP_SOFTERR ? \ + const size_t totbits = fpopts->flip == CPFLOAT_SOFTERR_FP ? \ NBITS : DEFPREC; \ FOR_STRING(PARALLEL_TYPE) \ for (size_t i = 0; i < numelem; i++) { \ @@ -1207,8 +1221,8 @@ GENERATE_UNIVARIATE_MATH_H(lgamma, lgamma) int temp = ADDSUFFIXTO(ilogb)(*(A+i)); \ if (temp == FP_ILOGB0 || temp == FP_ILOGBNAN || temp == INT_MAX) { \ params.precision = DEFPREC-1; \ - params.emax = DEFEMAX; \ params.emin = DEFEMIN; \ + params.emax = DEFEMAX; \ } else { \ params.precision = temp + 1; \ params.ftzthreshold = 1.0; \ @@ -1490,8 +1504,8 @@ GENERATE_TRIVARIATE_MATH_H(fma, fma) #undef INTSUFFIX #undef DEFPREC -#undef DEFEMAX #undef DEFEMIN +#undef DEFEMAX #undef NLEADBITS #undef NBITS #undef FULLMASK diff --git a/test/cpfloat_test.m b/test/cpfloat_test.m index abef1f8..d44b4fd 100644 --- a/test/cpfloat_test.m +++ b/test/cpfloat_test.m @@ -20,46 +20,68 @@ pi_h = 6432*uh; % fp16(pi) % Check handling of defaults and persistent variable. - fp.format = 'bfloat16'; [c,options] = cpfloat(pi,fp); + fp.format = 'bfloat16'; + [~,options] = cpfloat(pi,fp); assert_eq(fp.format,options.format) assert_eq(options.subnormal,0) - fp.format = []; [c,options] = cpfloat(pi,fp); + fp.format = []; + [~,options] = cpfloat(pi,fp); assert_eq(options.format,'h') % Check default; - fp.subnormal = 0; [c,options] = cpfloat(pi,fp); - assert_eq(options.subnormal,0) + fp.explim = []; + [~,options] = cpfloat(pi,fp); + assert_eq(options.explim,1) % Check default. - fp.subnormal = []; [c,options] = cpfloat(pi,fp); - assert_eq(options.subnormal,1) % Check default; + fp.explim = 0; + [~,options] = cpfloat(pi,fp); + assert_eq(options.explim,0) % Check no default. - fp.round = []; [c,options] = cpfloat(pi,fp); + fp.round = []; + [~,options] = cpfloat(pi,fp); assert_eq(options.round,1) % Check default. - fp.flip = []; [c,options] = cpfloat(pi,fp); - assert_eq(options.flip,0) % Check no default. + fp.saturation = 1; + [~,options] = cpfloat(pi,fp); + assert_eq(options.saturation,1) - fp.explim = []; [c,options] = cpfloat(pi,fp); - assert_eq(options.explim,1) % Check default. + fp.saturation = []; + [~,options] = cpfloat(pi,fp); + assert_eq(options.saturation,0) % Check default; - fp.explim = 0; [c,options] = cpfloat(pi,fp); - assert_eq(options.explim,0) % Check no default. + fp.subnormal = 0; + [~,options] = cpfloat(pi,fp); + assert_eq(options.subnormal,0) + + fp.subnormal = []; + [~,options] = cpfloat(pi,fp); + assert_eq(options.subnormal,1) % Check default; + + fp.flip = []; + [~,options] = cpfloat(pi,fp); + assert_eq(options.flip,0) % Check no default. clear cpfloat fp options - fp.flip = 1; [~,options] = cpfloat([],fp); + fp.flip = 1; + [~,options] = cpfloat([],fp); assert_eq(options.format,'h') assert_eq(options.round,1) + assert_eq(options.saturation,0) assert_eq(options.subnormal,1) clear cpfloat fp options % check all default options - fp.format = []; fp.subnormal = []; - fp.round = []; fp.flip = []; + fp.format = []; + fp.round = []; + fp.saturation = []; + fp.subnormal = []; + fp.flip = []; fp.p = []; - [c,options] = cpfloat(pi,fp); + [~,options] = cpfloat(pi,fp); assert_eq(options.format,'h') - assert_eq(options.subnormal,1) assert_eq(options.round,1) + assert_eq(options.saturation,0) + assert_eq(options.subnormal,1) assert_eq(options.flip,0) assert_eq(options.p,0.5) % % Takes different path from previous test since fpopts exists. @@ -71,18 +93,22 @@ clear cpfloat fp fp.flip = 1; fp.format = 'd'; c = ones(8,1); - d = cpfloat(c,fp); assert_eq(norm(d-c,1)>0,true); - d = cpfloat(c',fp); assert_eq(norm(d-c',1)>0,true); + d = cpfloat(c,fp); + assert_eq(norm(d-c,1)>0,true); + d = cpfloat(c',fp); + assert_eq(norm(d-c',1)>0,true); fp.p = 0; % No bits flipped. - d = cpfloat(c,fp); assert_eq(d,d); + d = cpfloat(c,fp); + assert_eq(d,d); fp.p = 1; % All bits flipped. - d = cpfloat(c,fp); assert_eq(all(d ~= c),true); + d = cpfloat(c,fp); + assert_eq(all(d ~= c),true); clear cpfloat [~,fp] = cpfloat; assert_eq(fp.subnormal,1) assert_eq(fp.format,'h') - [c,options] = cpfloat(pi); + [~,options] = cpfloat(pi); assert_eq(options.format,'h') assert_eq(options.subnormal,1) assert_eq(options.round,1) @@ -91,7 +117,7 @@ clear fp fp.format = 'd'; - [c,options] = cpfloat(pi,fp); + [~,options] = cpfloat(pi,fp); assert_eq(options.format,'d') assert_eq(options.subnormal,1) assert_eq(options.params, [53 -1022 1023]) @@ -101,7 +127,20 @@ assert_eq(fp.params, [53 -1022 1023]) clear fp - fp.format = 'bfloat16'; [c,options] = cpfloat(pi,fp); + fp.format = 'E4M3'; + [~,options] = cpfloat(pi,fp); + assert_eq(options.format,'E4M3') + assert_eq(options.infinity,0) + assert_eq(options.params, [4 -6 8]) + [~,fp] = cpfloat; + assert_eq(fp.format,'E4M3') + assert_eq(fp.infinity,0) + assert_eq(fp.params, [4 -6 8]) + + + clear fp + fp.format = 'bfloat16'; + [~,options] = cpfloat(pi,fp); assert_eq(options.format,'bfloat16') assert_eq(options.subnormal,0) assert_eq(options.params, [8 -126 127]) @@ -114,7 +153,8 @@ [~,fp] = cpfloat; fp.format = 'b'; fp = rmfield(fp, 'params'); - [c,options] = cpfloat(pi,fp); + [~,options] = cpfloat(pi,fp); + assert_eq(options.saturation,0) % No saturation if that field was empty. assert_eq(options.subnormal,1) % No subnormals only if that field was empty. % Check these usages do not give an error. @@ -134,7 +174,8 @@ B = A + randn(size(A))*1e-12; C = cpfloat(B,options); assert_eq(A,C); - A2 = hilb(6); C = cpfloat(A2); + A2 = hilb(6); + C = cpfloat(A2); options.format = 'c'; options.params = [8 -126 127]; % bfloat16 @@ -152,7 +193,7 @@ [X1,opt] = cpfloat(A,options); [X2,opt2] = cpfloat(A,options2); assert_eq(X1,X2) - % assert_eq(cpfloat(A,options),cpfloat(A,options2)); + assert_eq(cpfloat(A,opt),cpfloat(A,opt2)); % Row vector clear options @@ -185,14 +226,14 @@ elseif i == 2 % Half precision tests. [u,xmins,xmin,xmax,p,emins,emin,emax] = float_params('half'); - options.format = 'h' + options.format = 'h'; elseif i == 3 % Quarter precision tests. [u,xmins,xmin,xmax,p,emins,emin,emax] = float_params('q43'); options.format = 'E4M3'; % Modification for OCP compliant q43. emin = -6; % Previously thought to be 1-emax=-7. - emax = 8; % Previously thought to be 7 + emax = 8; % Previously thought to be 7 emins = emin + 1 - p; % Exponent of smallest subnormal number. xmins = 2^emins; xmin = 2^emin; @@ -282,7 +323,22 @@ assert_eq(c,x) end + % Saturation tests. + options.saturation = 1; + for j = 1:6 + options.round = j; + x = inf; + c = cpfloat(x,options); + assert_eq(c,xmax) + c = cpfloat(-x,options); + assert_eq(c,-xmax) + end + % Infinities tests. + [~,fpopts] = cpfloat; + prev_infinity = fpopts.infinity; + options.infinity = 1; + options.saturation = 0; for j = 1:6 options.round = j; x = inf; @@ -345,18 +401,21 @@ end options.subnormal = 1; - c = cpfloat(xmin,options); assert_eq(c,xmin) + c = cpfloat(xmin,options); + assert_eq(c,xmin) x = [xmins xmin/2 xmin 0 xmax 2*xmax 1-delta/5 1+delta/4]; c = cpfloat(x,options); c_expected = [x(1:5) inf 1 1]; assert_eq(c,c_expected) options.subnormal = 0; - c = cpfloat(xmin,options); assert_eq(c,xmin) + c = cpfloat(xmin,options); + assert_eq(c,xmin) x = [xmins xmin/2 xmin 0 xmax 2*xmax 1-delta/5 1+delta/4]; c = cpfloat(x,options); c_expected = [0 0 x(3:5) inf 1 1]; assert_eq(c,c_expected) + options.infinity = prev_infinity; % Smallest normal number and spacing between the subnormal numbers. y = xmin; delta = xmin*2^(1-p); @@ -409,12 +468,24 @@ % Do not limit exponent. options.explim = 0; - x = xmin/2; c = cpfloat(x,options); assert_eq(c,x) - x = -xmin/2; c = cpfloat(x,options); assert_eq(c,x) - x = xmax*2; c = cpfloat(x,options); assert_eq(c,x) - x = -xmax*2; c = cpfloat(x,options); assert_eq(c,x) - x = xmins/2; c = cpfloat(x,options); assert_eq(c,x) - x = -xmins/2; c = cpfloat(x,options); assert_eq(c,x) + x = xmin/2; + c = cpfloat(x,options); + assert_eq(c,x) + x = -xmin/2; + c = cpfloat(x,options); + assert_eq(c,x) + x = xmax*2; + c = cpfloat(x,options); + assert_eq(c,x) + x = -xmax*2; + c = cpfloat(x,options); + assert_eq(c,x) + x = xmins/2; + c = cpfloat(x,options); + assert_eq(c,x) + x = -xmins/2; + c = cpfloat(x,options); + assert_eq(c,x) A = [pi -pi; pi -pi]; C = cpfloat(A,options); options.explim = 1; @@ -498,21 +569,32 @@ % Test rounding with CHOPFAST versus native rounding. options.format = 's'; - m = 100; y = zeros(3,n); z = y; + m = 100; + y = zeros(3,n); + z = y; for i = 1:m x = randn; - options.round = 2; y(i,1) = cpfloat(x,options); - options.round = 3; y(i,2) = cpfloat(x,options); - options.round = 4; y(i,3) = cpfloat(x,options); + options.round = 2; + y(i,1) = cpfloat(x,options); + options.round = 3; + y(i,2) = cpfloat(x,options); + options.round = 4; + y(i,3) = cpfloat(x,options); if usingoctave - fesetround(inf); z(i,1) = single(x); - fesetround(-inf); z(i,2) = single(x); - fesetround(0); z(i,3) = single(x); + fesetround(inf); + z(i,1) = single(x); + fesetround(-inf); + z(i,2) = single(x); + fesetround(0); + z(i,3) = single(x); else % Use undocumented function to set rounding mode in MATLAB. - feature('setround',inf), z(i,1) = single(x); - feature('setround',-inf), z(i,2) = single(x); - feature('setround',0), z(i,3) = single(x); + feature('setround',inf); + z(i,1) = single(x); + feature('setround',-inf); + z(i,2) = single(x); + feature('setround',0); + z(i,3) = single(x); end end assert_eq(y,z) @@ -533,9 +615,15 @@ c = cpfloat(x,options); assert_eq(c,[0 0 x(3:4)]) - options.format = 'd'; options.subnormal = 0; cpfloat([],options); - a = cpfloat(pi); assert_eq(a,pi) - options.format = 'd'; options.subnormal = 1; cpfloat([],options); + options.format = 'd'; + options.subnormal = 0; + cpfloat([],options); + a = cpfloat(pi); + assert_eq(a,pi) + + options.format = 'd'; + options.subnormal = 1; + cpfloat([],options); a = cpfloat(pi); assert_eq(a,pi) x = pi^2; @@ -563,8 +651,10 @@ yd = cpfloat(pd); assert_eq(double(ys),yd) - options.format = 'h'; options.round = 2; - as = single(rand(n,1)); ad = double(as); + options.format = 'h'; + options.round = 2; + as = single(rand(n,1)); + ad = double(as); delta = single(rand(n,1)); cd = cpfloat(ad + 1e-5*double(delta),options); cs = cpfloat(as + 1e-5*delta,options); @@ -581,21 +671,21 @@ % Test base 2 logarithm options.format = 'h'; options.round = 4; - x = single(2^-3 * (sum(2.^(-[0:23])))); - assert_eq(cpfloat(x,options), single(2^-3 * (sum(2.^(-[0:10]))))) + x = single(2^-3 * (sum(2.^(-(0:23))))); + assert_eq(cpfloat(x,options), single(2^-3 * (sum(2.^(-(0:10)))))) - x = 2^-3 * (sum(2.^(-[0:52]))); - assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-[0:10])))) + x = 2^-3 * (sum(2.^(-(0:52)))); + assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-(0:10))))) options.format = 's'; - x = single(2^-3 * (sum(2.^(-[0:23])))); + x = single(2^-3 * (sum(2.^(-(0:23))))); assert_eq(cpfloat(x,options), x) - x = 2^-3 * (sum(2.^(-[0:52]))); - assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-[0:23])))) + x = 2^-3 * (sum(2.^(-(0:52)))); + assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-(0:23))))) options.format = 'd'; - x = 2^-3 * (sum(2.^(-[0:52]))); + x = 2^-3 * (sum(2.^(-(0:52)))); assert_eq(cpfloat(x,options), x) options.round = 1; diff --git a/test/cpfloat_test.ts b/test/cpfloat_test.ts index be83030..fb3d791 100644 --- a/test/cpfloat_test.ts +++ b/test/cpfloat_test.ts @@ -53,10 +53,14 @@ optstruct *fpopts; void fpopts_setup(void) { fpopts = malloc(sizeof(optstruct)); - fpopts->flip = 0; - fpopts->p = 0.5; fpopts->round = 1; fpopts->explim = CPFLOAT_EXPRANGE_STOR; + fpopts->infinity = CPFLOAT_INF_USE; + fpopts->saturation = CPFLOAT_SAT_NO; + + fpopts->flip = 0; + fpopts->p = 0.5; + fpopts->bitseed = NULL; fpopts->randseedf = NULL; fpopts->randseed = NULL; @@ -163,7 +167,7 @@ void init_intarray_rounding_double(uint64_t *x, size_t n, uint64_t first, uint64_t step) { x[0] = first + 1ul; x[1] = first + (step >> 1); - for (size_t i = 2; i < n; i+=3) { + for (size_t i = 2; i < n; i += 3) { first += step; x[i] = first - 1ul; if (i+1 < n) @@ -178,7 +182,7 @@ void init_intarray_rounding_float(uint32_t *x, size_t n, uint32_t first, uint32_t step) { x[0] = first + 1u; x[1] = first + (step >> 1); - for (size_t i = 2; i < n; i+=3) { + for (size_t i = 2; i < n; i += 3) { first += step; x[i] = first - 1u; if (i+1 < n) @@ -209,13 +213,13 @@ void init_fparray_rounding_double(double *x, size_t n, double first, double step) { x[0] = FPOFd(INTOFd(first) + 1lu); x[1] = first + step/2; - for (size_t i = 2; i < n; i+=3) { + for (size_t i = 2; i < n; i += 3) { first += step; x[i] = FPOFd(INTOFd(first) - 1lu); if (i+1 < n) - x[i+1] = FPOFd(INTOFd(first) + 1lu); + x[i+1] = FPOFd(INTOFd(first) + 1lu); if (i+2 < n) - x[i+2] = first + step/2; + x[i+2] = first + step/2; } } @@ -224,13 +228,13 @@ void init_fparray_rounding_float(float *x, size_t n, float first, float step) { x[0] = FPOFf(INTOFf(first) + 1lu); x[1] = first + step/2; - for (size_t i = 2; i < n; i+=3) { + for (size_t i = 2; i < n; i += 3) { first += step; x[i] = FPOFf(INTOFf(first) - 1lu); if (i+1 < n) - x[i+1] = FPOFf(INTOFf(first) + 1lu); + x[i+1] = FPOFf(INTOFf(first) + 1lu); if (i+2 < n) - x[i+2] = first + step/2; + x[i+2] = first + step/2; } } @@ -368,7 +372,7 @@ void check_equality_double_long_long(double *x, long long *y, size_t n) { static inline void check_equality_float(float *x, float *y, size_t n) { for (size_t j = 0; j < n; j++) { - if (!nan_safe_compare_float(x[j], y[j])) { + if (!nan_safe_compare_float(x[j], y[j])) { printf("FLOAT\n"); printf("***\nj = %ld\nx = %23.15e [%X]\ny = %23.15e\n", j, x[j], *(uint32_t *)(x + j), (float)y[j]); @@ -458,12 +462,12 @@ void check_array_float(float *y, float *x, float *ref, "pr = %d\n" "e = %d\n" "s = %d\n", - j, - x[j], * (uint32_t *)(x+j), - y[j], * (uint32_t *)(y+j), - ref[j], * (uint32_t *)(ref+j), - fpopts->round, fpopts->precision, - fpopts->emax, fpopts->subnormal); + j, + x[j], * (uint32_t *)(x+j), + y[j], * (uint32_t *)(y+j), + ref[j], * (uint32_t *)(ref+j), + fpopts->round, fpopts->precision, + fpopts->emax, fpopts->subnormal); } ck_assert(nan_safe_compare_float(y[j], ref[j])); } @@ -593,7 +597,8 @@ void select_tests_det_double(double *y, double *x, double *ref, fpopts->round = round; for (format = minformat; format <= maxformat; format++) { fpopts->precision = precision[format]; - fpopts->emax = emax[format]; fpopts->emin = emin[format]; + fpopts->emax = emax[format]; + fpopts->emin = emin[format]; for (subnormal = minsubnormal; subnormal <= maxsubnormal; subnormal++) { fpopts->subnormal = subnormal; for (explim = minexplim; explim <= maxexplim; explim++) { @@ -625,7 +630,8 @@ void select_tests_det_float(float *y, float *x, float *ref, fpopts->round = round; for (format = minformat; format <= maxformat; format++) { fpopts->precision = precision[format]; - fpopts->emax = emax[format]; fpopts->emin = emin[format]; + fpopts->emax = emax[format]; + fpopts->emin = emin[format]; for (subnormal = minsubnormal; subnormal <= maxsubnormal; subnormal++) { fpopts->subnormal = subnormal; for (explim = minexplim; explim <= maxexplim; explim++) { @@ -657,7 +663,8 @@ void select_tests_stoc_double(double *tmpin, double *tmpout, fpopts->round = mode; for (format = minformat; format <= maxformat; format++) { fpopts->precision = precision[format]; - fpopts->emax = emax[format]; fpopts->emin = emin[format]; + fpopts->emax = emax[format]; + fpopts->emin = emin[format]; for (subnormal = minsubnormal; subnormal <= maxsubnormal; subnormal++) { fpopts->subnormal = subnormal; for (explim = minexplim; explim <= maxexplim; explim++) { @@ -692,7 +699,8 @@ void select_tests_stoc_float(float *tmpin, float *tmpout, fpopts->round = mode; for (format = minformat; format <= maxformat; format++) { fpopts->precision = precision[format]; - fpopts->emax = emax[format]; fpopts->emin = emin[format]; + fpopts->emax = emax[format]; + fpopts->emin = emin[format]; for (subnormal = minsubnormal; subnormal <= maxsubnormal; subnormal++) { fpopts->subnormal = subnormal; for (explim = minexplim; explim <= maxexplim; explim++) { @@ -709,25 +717,25 @@ void select_tests_stoc_float(float *tmpin, float *tmpout, double *alloc_init_array_double(double *x, size_t n) { double *y = malloc(n * sizeof(*y)); - for (size_t i=0; iprecision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = ldexp(1.,fpopts->precision-1) - 1; // number of subnormals double *xd = malloc(n * sizeof(*xd)); init_fparray_double(xd, n, minsubnormal(fpopts), minsubnormal(fpopts)); @@ -962,10 +971,11 @@ for (size_t mode=1; mode<3; mode++) { /* Exactly representable normals. */ #test no_rounding_normal_numbers printf("1c. No rounding: normal numbers\n"); -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = ldexp(1., fpopts->precision-1) * 2 * fpopts->emax; uint64_t *xd = malloc(n * sizeof(*xd)); init_intarray_double(xd, n, intminnormal_double(fpopts), @@ -1014,12 +1024,13 @@ double *refxmind = malloc(n * sizeof(*refxmind)); float *xf = malloc(n * sizeof(*xf)); float *refzerof = calloc(n, sizeof(*refzerof)); float *refxminf = malloc(n * sizeof(*refxminf)); -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { double *yd = allocate_array_double(xd, n, mode); float *yf = allocate_array_float(xf, n, mode); for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; double xmin = minnormal(fpopts); double snxmin = minsubnormal(fpopts); double halfxmin = xmin / 2; @@ -1164,12 +1175,13 @@ double *refxmind = malloc(n * sizeof(*refxmind)); float *xf = malloc(n * sizeof(*xf)); float *refzerof = calloc(n, sizeof(*refzerof)); float *refxminf = malloc(n * sizeof(*refxminf)); -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { double *yd = allocate_array_double(xd, n, mode); float *yf = allocate_array_float(xf, n, mode); for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; double xmin = minsubnormal(fpopts); double halfxmin = xmin / 2; double xd_imm [] = {nextafter(0, INFINITY), @@ -1308,145 +1320,165 @@ double *refinfd = malloc(n * sizeof(*refinfd)); float *xf = malloc(n * sizeof(*xf)); float *refxmaxf = malloc(n * sizeof(*refxmaxf)); float *refinff = malloc(n * sizeof(*refinff)); -for (size_t mode=1; mode<3; mode++) { - double *yd = allocate_array_double(xd, n, mode); - float *yf = allocate_array_float(xf, n, mode); - for (size_t i = 0; i < nformats; i++) { - fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; - double xmax = maxnormal(fpopts); - double xbound = maxbound(fpopts); - double xd_imm [] = {nextafter(xmax, INFINITY), - nextafter(xbound, 0), - xbound, - nextafter(xbound, INFINITY)}; - double refd [] = {xmax, xmax, inf_double(), inf_double()}; - for (size_t j = 0; j < n; j++) { - refxmaxd[j] = xmax; - refinfd[j] = inf_double(); - } - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, -1, -1, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, 1, 1, i, i, -1, 1); - refd[2] = xmax; - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, 0, 0, i, i, -1, 1); - refd[2] = inf_double(); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refinfd, n, fpopts, 2, 2, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 3, 4, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 7, 7, i, i, -1, 1); - csign_intarray_double((uint64_t *)xd_imm, n); - csign_intarray_double((uint64_t *)refd, n); - csign_intarray_double((uint64_t *)refxmaxd, n); - csign_intarray_double((uint64_t *)refinfd, n); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, -1, -1, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, 1, 1, i, i, -1, 1); - refd[2] = -xmax; - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, 0, 0, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 2, 2, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refinfd, n, fpopts, 3, 3, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 4, 4, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 7, 7, i, i, -1, 1); +for (size_t infinity = 0; infinity < 2; infinity++) { + fpopts->infinity = infinity; + for (size_t saturation = 0; saturation < 2; saturation++) { + fpopts->saturation = saturation; + for (size_t mode = 1; mode < 3; mode++) { + double *yd = allocate_array_double(xd, n, mode); + float *yf = allocate_array_float(xf, n, mode); + for (size_t i = 0; i < nformats; i++) { + fpopts->precision = precision[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; + + double xmax = maxnormal(fpopts); + double xbound = maxbound(fpopts); + + double of_value = saturation == 1 ? xmax : + (infinity == 1 ? inf_double() : NAN); + + double xd_imm [] = {nextafter(xmax, INFINITY), + nextafter(xbound, 0), + xbound, + nextafter(xbound, INFINITY)}; + double refd [] = {xmax, xmax, of_value, of_value}; + for (size_t j = 0; j < n; j++) { + refxmaxd[j] = xmax; + refinfd[j] = of_value; + } - float xf_imm [] = {nextafterf(xmax, INFINITY), - nextafterf(xbound, 0), - xbound, - nextafterf(xbound, INFINITY)}; - float reff [] = {xmax, xmax, inf_float(), inf_float()}; - for (size_t j = 0; j < n; j++) { - refxmaxf[j] = xmax; - refinff[j] = inf_float(); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, -1, -1, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, 1, 1, i, i, -1, 1); + refd[2] = xmax; + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, 0, 0, i, i, -1, 1); + refd[2] = of_value; + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refinfd, n, fpopts, 2, 2, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 3, 4, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 7, 7, i, i, -1, 1); + csign_intarray_double((uint64_t *)xd_imm, n); + csign_intarray_double((uint64_t *)refd, n); + csign_intarray_double((uint64_t *)refxmaxd, n); + csign_intarray_double((uint64_t *)refinfd, n); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, -1, -1, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, 1, 1, i, i, -1, 1); + refd[2] = -xmax; + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, 0, 0, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 2, 2, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refinfd, n, fpopts, 3, 3, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 4, 4, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 7, 7, i, i, -1, 1); + + float of_valuef = saturation == 1 ? xmax : + (infinity == 1 ? inf_float() : NAN); + float xf_imm [] = {nextafterf(xmax, INFINITY), + nextafterf(xbound, 0), + xbound, + nextafterf(xbound, INFINITY)}; + float reff [] = {xmax, xmax, of_valuef, of_valuef}; + for (size_t j = 0; j < n; j++) { + refxmaxf[j] = xmax; + refinff[j] = of_valuef; + } + + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, -1, -1, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, 1, 1, i, i, -1, 1); + reff[2] = xmax; + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, 0, 0, i, i, -1, 1); + reff[2] = of_valuef; + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refinff, n, fpopts, 2, 2, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 3, 4, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 7, 7, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + csign_intarray_float((uint32_t *)xf_imm, n); + csign_intarray_float((uint32_t *)reff, n); + csign_intarray_float((uint32_t *)refxmaxf, n); + csign_intarray_float((uint32_t *)refinff, n); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, -1, -1, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, 1, 1, i, i, -1, 1); + reff[2] = -xmax; + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, 0, 0, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 2, 2, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refinff, n, fpopts, 3, 3, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 4, 4, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 7, 7, i, i, -1, 1); + } + free_array_double(yd, mode); + free_array_float(yf, mode); } - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, -1, -1, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, 1, 1, i, i, -1, 1); - reff[2] = xmax; - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, 0, 0, i, i, -1, 1); - reff[2] = inf_float(); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refinff, n, fpopts, 2, 2, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 3, 4, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 7, 7, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - csign_intarray_float((uint32_t *)xf_imm, n); - csign_intarray_float((uint32_t *)reff, n); - csign_intarray_float((uint32_t *)refxmaxf, n); - csign_intarray_float((uint32_t *)refinff, n); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, -1, -1, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, 1, 1, i, i, -1, 1); - reff[2] = -xmax; - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, 0, 0, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 2, 2, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refinff, n, fpopts, 3, 3, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 4, 4, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 7, 7, i, i, -1, 1); } - free_array_double(yd, mode); - free_array_float(yf, mode); -} -free(xd); -free(refxmaxd); -free(refinfd); -free(xf); -free(refxmaxf); -free(refinff); + free(xd); + free(refxmaxd); + free(refinfd); + free(xf); + free(refxmaxf); + free(refinff); + } +fpopts->saturation = CPFLOAT_SAT_NO; +fpopts->infinity = CPFLOAT_INF_USE; + /* Rounding of numbers in the subnormal range. */ #test deterministic_rounding_subnormal_numbers printf("2d. Deterministic rounding: subnormal numbers\n"); -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = 3 * ldexp(1.,fpopts->precision-1) - 1; double *xd_imm = malloc(n * sizeof(*xd_imm)); double *xd = malloc(n * sizeof(*xd)); @@ -1472,7 +1504,7 @@ for (size_t mode=1; mode<3; mode++) { -1, -1, i, i, 1, 1); csign_intarray_double((uint64_t *)xd_imm, n); csign_intarray_double((uint64_t *)refd, n); - for (size_t j = 1; j < n; j+=3) + for (size_t j = 1; j < n; j += 3) refd[j] -= stepd; reset_array_double(yd, n, mode); copy_array_double(xd, xd_imm, n); @@ -1488,7 +1520,7 @@ for (size_t mode=1; mode<3; mode++) { csign_intarray_double((uint64_t *)refd, n); reset_array_double(yd, n, mode); copy_array_double(xd, xd_imm, n); - for (size_t j = 1; j < n; j+=6) + for (size_t j = 1; j < n; j += 6) refd[j] += stepd; reset_array_double(yd, n, mode); copy_array_double(xd, xd_imm, n); @@ -1573,7 +1605,7 @@ for (size_t mode=1; mode<3; mode++) { -1, -1, i, i, 1, 1); csign_intarray_float((uint32_t *)xf_imm, n); csign_intarray_float((uint32_t *)reff, n); - for (size_t j = 1; j < n; j+=3) + for (size_t j = 1; j < n; j += 3) reff[j] -= stepf; reset_array_float(yf, n, mode); copy_array_float(xf, xf_imm, n); @@ -1587,7 +1619,7 @@ for (size_t mode=1; mode<3; mode++) { 0, 0, i, i, 1, 1); csign_intarray_float((uint32_t *)xf_imm, n); csign_intarray_float((uint32_t *)reff, n); - for (size_t j = 1; j < n; j+=6) + for (size_t j = 1; j < n; j += 6) reff[j] += stepf; reset_array_float(yf, n, mode); copy_array_float(xf, xf_imm, n); @@ -1655,10 +1687,11 @@ for (size_t mode=1; mode<3; mode++) { /* Rounding of numbers in the normal range. */ #test deterministic_rounding_normal_numbers printf("2e. Deterministic rounding: normal numbers\n"); -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = 3 * (ldexp(1., fpopts->precision-1) * 2 * fpopts->emax - 1); uint64_t *xd_imm = malloc(n * sizeof(*xd_imm)); uint64_t *refd = malloc(n * sizeof(*refd)); @@ -1684,7 +1717,7 @@ for (size_t mode=1; mode<3; mode++) { -1, -1, i, i, -1, -1); csign_intarray_double(xd_imm, n); csign_intarray_double(refd, n); - for (size_t j = 1; j < n; j+=3) + for (size_t j = 1; j < n; j += 3) refd[j] -= stepd; reset_array_double(yd, n, mode); copy_array_double(xd, (double *)xd_imm, n); @@ -1698,7 +1731,7 @@ for (size_t mode=1; mode<3; mode++) { 0, 0, i, i, -1, -1); csign_intarray_double(xd_imm, n); csign_intarray_double(refd, n); - for (size_t j = 4; j < n; j+=6) + for (size_t j = 4; j < n; j += 6) refd[j] += stepd; reset_array_double(yd, n, mode); copy_array_double(xd, (double *)xd_imm, n); @@ -1785,7 +1818,7 @@ for (size_t mode=1; mode<3; mode++) { -1, -1, i, i, -1, -1); csign_intarray_float(xf_imm, n); csign_intarray_float(reff, n); - for (size_t j = 1; j < n; j+=3) + for (size_t j = 1; j < n; j += 3) reff[j] -= stepf; reset_array_float(yf, n, mode); copy_array_float(xf, (float *)xf_imm, n); @@ -1799,7 +1832,7 @@ for (size_t mode=1; mode<3; mode++) { 0, 0, i, i, -1, -1); csign_intarray_float(xf_imm, n); csign_intarray_float(reff, n); - for (size_t j = 4; j < n; j+=6) + for (size_t j = 4; j < n; j += 6) reff[j] += stepf; reset_array_float(yf, n, mode); copy_array_float(xf, (float *)xf_imm, n); @@ -1878,11 +1911,12 @@ double *tmpdout = malloc(NREPS * sizeof(*tmpdout)); float *tmpfin = malloc(NREPS * sizeof(*tmpfin)); float *tmpfout = malloc(NREPS * sizeof(*tmpfout)); double proundequi = 0.50; -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { for(subnormal = 0; subnormal <= 1; subnormal++) { for(size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; double xmin; if (subnormal) xmin = minsubnormal(fpopts); @@ -1929,11 +1963,12 @@ double *tmpdout = malloc(NREPS * sizeof(*tmpdout)); float *tmpfin = malloc(NREPS * sizeof(*tmpfin)); float *tmpfout = malloc(NREPS * sizeof(*tmpfout)); double proundequi = 0.50; -for (size_t mode=1; mode<3; mode++ ) { +for (size_t mode = 1; mode < 3; mode++ ) { for(subnormal = 0; subnormal <= 1; subnormal++) { for(i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; double xmax = maxnormal(fpopts); double xbound = maxbound(fpopts); double *yd = malloc(n * sizeof(*yd)); @@ -2009,10 +2044,11 @@ double *tmpdout = malloc(NREPS * sizeof(*tmpdout)); float *tmpfin = malloc(NREPS * sizeof(*tmpfin)); float *tmpfout = malloc(NREPS * sizeof(*tmpfout)); double proundequi = 0.50; -for (size_t mode=1; mode<3; mode++ ) { +for (size_t mode = 1; mode < 3; mode++ ) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = 3 * ldexp(1.,fpopts->precision-1) - 1; double *xd = malloc(n * sizeof(*xd)); double stepd = minsubnormal(fpopts); @@ -2061,12 +2097,13 @@ double *tmpdout = malloc(NREPS * sizeof(*tmpdout)); float *tmpfin = malloc(NREPS * sizeof(*tmpfin)); float *tmpfout = malloc(NREPS * sizeof(*tmpfout)); double proundequi = 0.50; -for (size_t mode=1; mode<3; mode++ ) { +for (size_t mode = 1; mode < 3; mode++ ) { for (subnormal = 0; subnormal <= 1; subnormal++) { for (explim = 0; explim <= 1; explim++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = 3 * (ldexp(1., fpopts->precision-1) * 2 * fpopts->emax - 1); uint64_t *xd = malloc(n * sizeof(*xd)); @@ -2139,7 +2176,8 @@ size_t n_roundings = 7; for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; for (size_t i = 0; i < n_roundings; i++) { fpopts->round = det_roundings[i]; @@ -2272,7 +2310,8 @@ float *reff = malloc(n * sizeof(* xf)); for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; for (size_t i = 0; i < n_roundings; i++) { fpopts->round = det_roundings[i]; // Univariate functions. @@ -2413,7 +2452,8 @@ const int resd_fpclassify [] = {FP_NORMAL, FP_NORMAL,FP_NORMAL, FP_SUBNORMAL, for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; double xmind = minnormal(fpopts); double xminds = minsubnormal(fpopts); double ad[] = {-2., -1, -xmind, -xminds, xminds, xmind, 1., CONST_SQRT2, @@ -2505,7 +2545,8 @@ const int resf_fpclassify [] = {FP_NORMAL, FP_NORMAL,FP_NORMAL, for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; float xminf = minnormal(fpopts); float xminfs = minsubnormal(fpopts); float af[] = { -2., -1, -xminf, xminf, 1., CONST_SQRT2, @@ -2588,10 +2629,11 @@ printf("4c. Next and previous floating-point number\n"); fpopts->explim = CPFLOAT_EXPRANGE_TARG; fpopts->subnormal = CPFLOAT_SUBN_USE; // Subnormals -for (size_t mode=2; mode<3; mode++) { +for (size_t mode = 2; mode < 3; mode++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = (ldexp(1.,fpopts->precision-1) - 1) + 2; double *ad = malloc(n * sizeof(*ad)); @@ -2688,10 +2730,11 @@ for (size_t mode=2; mode<3; mode++) { } // Normals -for (size_t mode=2; mode<3; mode++) { +for (size_t mode = 2; mode < 3; mode++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = ldexp(1., fpopts->precision-1) * 2 * fpopts->emax + 2; double *ad = malloc(n * sizeof(*ad)); @@ -2972,7 +3015,8 @@ check_equality_double_long_long(rd7, xll, n-3); printf("4d. Arithmetic operations on large arrays\n"); size_t i = 0; fpopts->precision = precision[i]; -fpopts->emax = emax[i]; fpopts->emin = emin[i]; +fpopts->emax = emax[i]; +fpopts->emin = emin[i]; size_t n = 3 * (ldexp(1., fpopts->precision-1) * (ldexp(1., 5) - 2) - 1) - 2; double *ad = malloc(n * sizeof(*ad)); double *bd = malloc(n * sizeof(*bd));