apyfloat_util.h

Defines

GET_QNTZ_FUNC_ALTERNATIVE(Q, SUPPORT)
_ARGS_SD
_ARGS_QZ
_ARGS_ME
_ARGS_MS
_ARGS_AS
_ARGS_MUL_SHORT
_ARGS_MUL_GENERAL
_ARGS_ADD_SAME_WL
_ARGS_ADD_GENERAL
DEFINE_NEW_MUL(_NAME_, _CALLBACK_)
DEFINE_NEW_ADD(_NAME_, _CALLBACK_)

Functions

static inline bool do_infinity(QuantizationMode mode, bool sign)

Check if one should saturate to infinity or maximum normal number.

static inline std::uint64_t to_bits_uint64(const APyFloatData &d, std::uint8_t exp_bits, std::uint8_t man_bits)

Return the bit pattern of a floating-point data field. No checks on bit width are done.

static inline nb::int_ apyfloat_to_bits(const APyFloatData &d, std::uint8_t exp_bits, std::uint8_t man_bits)
static inline exp_t calc_bias(int new_exp_bits, int exp_bits1, exp_t bias1, int exp_bits2, exp_t bias2)

Calculate new bias. Assumes new_exp_bits is larger than exp_bits1 and exp_bits2.

static inline exp_t calc_bias(int new_exp_bits, const APyFloatSpec &spec1, const APyFloatSpec &spec2)
static inline exp_t calc_bias_general(int new_exp_bits, int exp_bits1, exp_t bias1, int exp_bits2, exp_t bias2)

General calculation of new bias. Should only be used if new_exp_bits can be strictly less than exp_bits1 or exp_bits2.

template<QuantizationMode QNTZ, bool SUPPORT_NEGATIVE_BITS_TO_QUANTIZE = false>
static inline auto _qntz_func(man_t &man, exp_t &exp, exp_t max_exp, int bits_to_quantize, bool sign, man_t man_msb_constant, man_t sticky_constant)

Specialized floating-point mantissa quantization functions.

static inline auto get_qntz_func(QuantizationMode qntz, bool support_negative_bits_to_quantize = false)
static inline man_t quantize_close_to_zero(bool sign, QuantizationMode quantization)

Quantize mantissa when the result is guaranteed to be either 0 or 1. Quantization mode STOCH_WEIGHTED should be used with this function.

static inline man_t ipow(man_t base, unsigned int n)

Fast integer power by squaring.

static inline unsigned int leading_zeros_apyfixed(const APyFixed &fx)

Get the number of left shifts needed to make fx>=1.0.

static inline QuantizationMode translate_quantization_mode(QuantizationMode quantization, bool sign)

Translate the quantization mode for floating-point to the fixed-point equivalent. This is used for the mantissa so the sign must be taken into account.

static inline void quantize_apymantissa(APyFixed &apyman, bool sign, int bits, QuantizationMode quantization)

Quantize mantissa stored as an APyFixed

static inline void check_exponent_format(int exp_bits)

Check that the number of exponent bits is allowed, throw otherwise.

static inline void check_mantissa_format(int man_bits)

Check that the number of mantissa bits is allowed, throw otherwise.

static inline exp_t ieee_bias(std::uint8_t exp_bits)

Retrieve a generalized IEEE-754 bias for exp_bits

static inline bool is_zero(const APyFloatData &src)
static inline bool is_max_exponent(const APyFloatData &src, uint8_t exp_bits)
static inline bool is_max_exponent(const APyFloatData &src, const APyFloatSpec &spec)
static inline bool is_normal(const APyFloatData &src, uint8_t exp_bits)
static inline bool is_nan(const APyFloatData &src, uint8_t exp_bits)
static inline bool is_nan(const APyFloatData &src, const APyFloatSpec &spec)
static inline bool is_inf(const APyFloatData &src, uint8_t exp_bits)
static inline bool is_inf(const APyFloatData &src, const APyFloatSpec &spec)
static inline int64_t true_exp(const APyFloatData &src, exp_t bias)
static inline int64_t true_exp(const APyFloatData &src, const APyFloatSpec &spec)
static inline std::tuple<int64_t, std::size_t> pure_exp(const APyFloatData &src, const APyFloatSpec &spec)
static inline man_t true_man(const APyFloatData &src, uint8_t exp_bits, uint8_t man_bits)
static inline man_t true_man(const APyFloatData &src, const APyFloatSpec &spec)
static inline std::tuple<APyFloatData, uint8_t, exp_t> normalize(const APyFloatData &src, uint8_t exp_bits, uint8_t man_bits, exp_t bias)

Return a normalized (non subnormal) floating-point copy of src

static inline std::tuple<APyFloatData, uint8_t, exp_t> normalize(const APyFloatData &src, const APyFloatSpec &spec)
static inline APyFloatData cast_no_quant(const APyFloatData &src, const APyFloatSpec &src_spec, const APyFloatSpec &dst_spec)
static inline bool floating_point_less_than(const APyFloatData &src1, const APyFloatSpec &src1_spec, const APyFloatData &src2, const APyFloatSpec &src2_spec)

Iterator-based less-than function, comparing src1 < src2

static inline bool floating_point_less_than_abs(const APyFloatData &src1, const APyFloatSpec &src1_spec, const APyFloatData &src2, const APyFloatSpec &src2_spec)

Iterator-based less-than function, comparing abs(src1) < abs(src2)

static inline bool floating_point_less_than_abs_same_wl(const APyFloatData &src1, const APyFloatData &src2)

Memory-based absolute value less-than function, comparing abs(src1) < abs(src2) when src1 and src2 have the same bit specifiers. This method assumes that neither src1 nor src2 are NaN or inf.

template<const bool SUB = false, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT, typename QNTZ_FUNC_SIGNATURE>
static inline void _floating_point_add_same_wl(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &spec, const QuantizationMode &qntz, QNTZ_FUNC_SIGNATURE qntz_func, const exp_t MAX_EXP, const man_t FINAL_RES_LO, const man_t RES_LO, const man_t CARRY_RES_LO, const man_t MAN_MASK, const unsigned NORMALIZATION_CONST)

Floating-point addition of src1 and src2 when they share the exact word length (exp_bits, man_bits, and bias). To use this function, all of the following must be satisfied:

  • spec.man_bits + 5 <= _MAN_T_SIZE_BITS

  • qntz != QuantizationMode::STOCH_WEIGHTED

template<const bool SUB = false, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT, typename QNTZ_FUNC_SIGNATURE>
static inline void _floating_point_add_diff_wl(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz, QNTZ_FUNC_SIGNATURE qntz_func, const exp_t RES_MAX_EXP, const man_t FINAL_RES_LO, const man_t RES_LO, const man_t CARRY_RES_LO, const man_t MAN_MASK, const unsigned NORMALIZATION_CONST)

Floating-point addition of src1 and src2 when they don’t share word lengths (any of: exp_bits, man_bits, and bias). To use this function, all of the following must be satisfied:

  • dst_spec.man_bits + 5 <= _MAN_T_SIZE_BITS

  • qntz != QuantizationMode::STOCH_WEIGHTED

template<const bool SUB = false, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
static inline void _floating_point_add_general(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz, const exp_t RES_MAX_EXP)

Floating-point addition of src1 and src2 for when _floating_point_add_same_wl or _floating_point_add_diff_wl can’t be used. Works in all cases, but is the slowest.

template<typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT, typename QNTZ_FUNC_SIGNATURE>
static inline void _floating_point_mul_short(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz, QNTZ_FUNC_SIGNATURE qntz_func, const unsigned SUM_MAN_BITS, const exp_t SRC1_MAX_EXP, const exp_t SRC2_MAX_EXP, const exp_t RES_MAX_EXP, const man_t TWO, const man_t TWO_BEFORE, const man_t ONE_BEFORE, const man_t TWO_RES, const int MAN_DELTA, const man_t STICKY)

Floating-point multiplication of src1 and src2 for when the mantissa product fits into a single std::uint64_t.

template<typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
static inline void _floating_point_mul_general(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz, const exp_t SRC1_MAX_EXP, const exp_t SRC2_MAX_EXP, const exp_t RES_MAX_EXP)

Floating-point multiplication of src1 and src2. This is the most general low-level multiplication function to use only when others won’t work.

template<const bool SUB = false, const std::size_t SRC1_INC = 1, const std::size_t SRC2_INC = 1, const std::size_t DST_INC = 1, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
static inline void floating_point_sums(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const std::size_t n_elements, const QuantizationMode &qntz)

Iterator-based floating-point addition.

template<const std::size_t SRC1_INC = 1, const std::size_t SRC2_INC = 1, const std::size_t DST_INC = 1, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
static inline void floating_point_products(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const std::size_t n_elements, const QuantizationMode &qntz)

Iterator-based floating-point products.

template<const std::size_t SRC1_INC = 1, const std::size_t SRC2_INC = 1, const std::size_t DST_INC = 1, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
static inline void floating_point_quotients(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const std::size_t n_elements, const QuantizationMode &qntz)

Iterator-based floating-point quotients.

static inline void floating_point_product(const APyFloatData &src1, const APyFloatData &src2, APyFloatData &dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz)

Perform a single float product.

template<const bool SUB = false>
static inline void floating_point_sum(const APyFloatData &src1, const APyFloatData &src2, APyFloatData &dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz)

Perform a single float sum.

template<const bool SUB = false>
static inline void floating_point_quotient(const APyFloatData &src1, const APyFloatData &src2, APyFloatData &dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz)

Perform a single float quotient.

Variables

static constexpr std::size_t _MAN_T_SIZE_BYTES = sizeof(man_t)

Sizes of APyFloat datatypes

static constexpr std::size_t _MAN_T_SIZE_BITS = 8 * _MAN_T_SIZE_BYTES
static constexpr std::size_t _EXP_T_SIZE_BYTES = sizeof(exp_t)
static constexpr std::size_t _EXP_T_SIZE_BITS = 8 * _EXP_T_SIZE_BYTES
static constexpr std::size_t _MAN_LIMIT_BITS = _MAN_T_SIZE_BITS - 3

APyFloat word length limits.

static constexpr std::size_t _EXP_LIMIT_BITS = _EXP_T_SIZE_BITS - 2
template<typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
struct FloatInnerProdFunctor

Public Functions

inline FloatInnerProdFunctor(const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz)
inline auto get_inner_ptr(const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz) const
DEFINE_NEW_MUL(MUL_SHORT, _floating_point_mul_short)
DEFINE_NEW_MUL(MUL_GENERAL, _floating_point_mul_general)
DEFINE_NEW_ADD(ADD_SAME_WL, _floating_point_add_same_wl)
DEFINE_NEW_ADD(ADD_GENERAL, _floating_point_add_general)
template<auto MUL_FUNC, auto ADD_FUNC>
inline void inner_product(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, std::size_t N, std::size_t M = 1, std::size_t DST_STEP = 1) const
inline void operator()(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, std::size_t N, std::size_t M = 1, std::size_t DST_STEP = 1) const

Public Members

const APyFloatSpec _src1_spec
const APyFloatSpec _src2_spec
const APyFloatSpec _dst_spec
const QuantizationMode _qntz
const std::optional<APyFloatAccumulatorOption> _accumulator_mode = std::nullopt
void (FloatInnerProdFunctor::*const inner_f_ptr)(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, std::size_t N, std::size_t M, std::size_t RES_STEP) const
const unsigned SUM_MAN_BITS = _src1_spec.man_bits + _src2_spec.man_bits
const exp_t SRC1_MAX_EXP = ((1ULL << _src1_spec.exp_bits) - 1)
const exp_t SRC2_MAX_EXP = ((1ULL << _src2_spec.exp_bits) - 1)
const exp_t RES_MAX_EXP = ((1ULL << _dst_spec.exp_bits) - 1)
const man_t TWO = 1ULL << (SUM_MAN_BITS + 2)
const man_t TWO_BEFORE = 1ULL << (SUM_MAN_BITS + 1)
const man_t ONE_BEFORE = 1ULL << (SUM_MAN_BITS + 0)
const man_t TWO_RES = (1ULL << _dst_spec.man_bits)
const int MAN_DELTA = int(SUM_MAN_BITS) + 2 - _dst_spec.man_bits
const man_t STICKY = (1ULL << (MAN_DELTA - 1)) - 1
const unsigned MAX_MAN_BITS = _dst_spec.man_bits + 5
const man_t FINAL_RES_LO = (1ULL << _dst_spec.man_bits)
const man_t RES_LO = FINAL_RES_LO << 3
const man_t CARRY_RES_LO = RES_LO << 1
const man_t MAN_MASK = CARRY_RES_LO - 1
const unsigned NORM_CONST = _MAN_T_SIZE_BITS - _dst_spec.man_bits - 4
const decltype(get_qntz_func(_qntz, MAN_DELTA < 0)) _qntz_func = get_qntz_func(_qntz, MAN_DELTA < 0)