apyfloat_util.h
¶
Defines
-
GET_QNTZ_FUNC_ALTERNATIVE(Q, SUPPORT)¶
-
_ARGS_SD¶
-
_ARGS_QZ¶
-
_ARGS_ME¶
-
_ARGS_MS¶
-
_ARGS_AS¶
-
_ARGS_MUL_SHORT¶
-
_ARGS_MUL_GENERAL¶
-
_ARGS_ADD_SAME_WL¶
-
_ARGS_ADD_GENERAL¶
-
DEFINE_NEW_MUL(_NAME_, _CALLBACK_)¶
-
DEFINE_NEW_ADD(_NAME_, _CALLBACK_)¶
Functions
-
static inline bool do_infinity(QuantizationMode mode, bool sign)¶
Check if one should saturate to infinity or maximum normal number.
-
static inline std::uint64_t to_bits_uint64(const APyFloatData &d, std::uint8_t exp_bits, std::uint8_t man_bits)¶
Return the bit pattern of a floating-point data field. No checks on bit width are done.
-
static inline nb::int_ apyfloat_to_bits(const APyFloatData &d, std::uint8_t exp_bits, std::uint8_t man_bits)¶
-
static inline exp_t calc_bias(int new_exp_bits, int exp_bits1, exp_t bias1, int exp_bits2, exp_t bias2)¶
Calculate new bias. Assumes new_exp_bits is larger than exp_bits1 and exp_bits2.
-
static inline exp_t calc_bias(int new_exp_bits, const APyFloatSpec &spec1, const APyFloatSpec &spec2)¶
-
static inline exp_t calc_bias_general(int new_exp_bits, int exp_bits1, exp_t bias1, int exp_bits2, exp_t bias2)¶
General calculation of new bias. Should only be used if new_exp_bits can be strictly less than exp_bits1 or exp_bits2.
-
template<QuantizationMode QNTZ, bool SUPPORT_NEGATIVE_BITS_TO_QUANTIZE = false>
static inline auto _qntz_func(man_t &man, exp_t &exp, exp_t max_exp, int bits_to_quantize, bool sign, man_t man_msb_constant, man_t sticky_constant)¶ Specialized floating-point mantissa quantization functions.
-
static inline auto get_qntz_func(QuantizationMode qntz, bool support_negative_bits_to_quantize = false)¶
-
static inline man_t quantize_close_to_zero(bool sign, QuantizationMode quantization)¶
Quantize mantissa when the result is guaranteed to be either 0 or 1. Quantization mode STOCH_WEIGHTED should be used with this function.
-
static inline man_t ipow(man_t base, unsigned int n)¶
Fast integer power by squaring.
-
static inline unsigned int leading_zeros_apyfixed(const APyFixed &fx)¶
Get the number of left shifts needed to make fx>=1.0.
-
static inline QuantizationMode translate_quantization_mode(QuantizationMode quantization, bool sign)¶
Translate the quantization mode for floating-point to the fixed-point equivalent. This is used for the mantissa so the sign must be taken into account.
-
static inline void quantize_apymantissa(APyFixed &apyman, bool sign, int bits, QuantizationMode quantization)¶
Quantize mantissa stored as an
APyFixed
-
static inline void check_exponent_format(int exp_bits)¶
Check that the number of exponent bits is allowed, throw otherwise.
-
static inline void check_mantissa_format(int man_bits)¶
Check that the number of mantissa bits is allowed, throw otherwise.
-
static inline exp_t ieee_bias(std::uint8_t exp_bits)¶
Retrieve a generalized IEEE-754 bias for
exp_bits
-
static inline bool is_zero(const APyFloatData &src)¶
-
static inline bool is_max_exponent(const APyFloatData &src, uint8_t exp_bits)¶
-
static inline bool is_max_exponent(const APyFloatData &src, const APyFloatSpec &spec)¶
-
static inline bool is_normal(const APyFloatData &src, uint8_t exp_bits)¶
-
static inline bool is_nan(const APyFloatData &src, uint8_t exp_bits)¶
-
static inline bool is_nan(const APyFloatData &src, const APyFloatSpec &spec)¶
-
static inline bool is_inf(const APyFloatData &src, uint8_t exp_bits)¶
-
static inline bool is_inf(const APyFloatData &src, const APyFloatSpec &spec)¶
-
static inline int64_t true_exp(const APyFloatData &src, exp_t bias)¶
-
static inline int64_t true_exp(const APyFloatData &src, const APyFloatSpec &spec)¶
-
static inline std::tuple<int64_t, std::size_t> pure_exp(const APyFloatData &src, const APyFloatSpec &spec)¶
-
static inline man_t true_man(const APyFloatData &src, uint8_t exp_bits, uint8_t man_bits)¶
-
static inline man_t true_man(const APyFloatData &src, const APyFloatSpec &spec)¶
-
static inline std::tuple<APyFloatData, uint8_t, exp_t> normalize(const APyFloatData &src, uint8_t exp_bits, uint8_t man_bits, exp_t bias)¶
Return a normalized (non subnormal) floating-point copy of
src
-
static inline std::tuple<APyFloatData, uint8_t, exp_t> normalize(const APyFloatData &src, const APyFloatSpec &spec)¶
-
static inline APyFloatData cast_no_quant(const APyFloatData &src, const APyFloatSpec &src_spec, const APyFloatSpec &dst_spec)¶
-
static inline bool floating_point_less_than(const APyFloatData &src1, const APyFloatSpec &src1_spec, const APyFloatData &src2, const APyFloatSpec &src2_spec)¶
Iterator-based less-than function, comparing
src1 < src2
-
static inline bool floating_point_less_than_abs(const APyFloatData &src1, const APyFloatSpec &src1_spec, const APyFloatData &src2, const APyFloatSpec &src2_spec)¶
Iterator-based less-than function, comparing
abs(src1) < abs(src2)
-
static inline bool floating_point_less_than_abs_same_wl(const APyFloatData &src1, const APyFloatData &src2)¶
Memory-based absolute value less-than function, comparing abs(src1) < abs(src2) when src1 and src2 have the same bit specifiers. This method assumes that neither src1 nor src2 is NaN or inf.
-
template<const bool SUB = false, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT, typename QNTZ_FUNC_SIGNATURE>
static inline void _floating_point_add_same_wl(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &spec, const QuantizationMode &qntz, QNTZ_FUNC_SIGNATURE qntz_func, const exp_t MAX_EXP, const man_t FINAL_RES_LO, const man_t RES_LO, const man_t CARRY_RES_LO, const man_t MAN_MASK, const unsigned NORMALIZATION_CONST)¶ Floating-point addition of
src1 and src2 when they share the exact word length (exp_bits, man_bits, and bias). To use this function, all of the following must be satisfied:
- spec.man_bits + 5 <= _MAN_T_SIZE_BITS
- qntz != QuantizationMode::STOCH_WEIGHTED
-
template<const bool SUB = false, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT, typename QNTZ_FUNC_SIGNATURE>
static inline void _floating_point_add_diff_wl(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz, QNTZ_FUNC_SIGNATURE qntz_func, const exp_t RES_MAX_EXP, const man_t FINAL_RES_LO, const man_t RES_LO, const man_t CARRY_RES_LO, const man_t MAN_MASK, const unsigned NORMALIZATION_CONST)¶ Floating-point addition of
src1 and src2 when they don't share word lengths (any of: exp_bits, man_bits, and bias). To use this function, all of the following must be satisfied:
- dst_spec.man_bits + 5 <= _MAN_T_SIZE_BITS
- qntz != QuantizationMode::STOCH_WEIGHTED
-
template<const bool SUB = false, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
static inline void _floating_point_add_general(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz, const exp_t RES_MAX_EXP)¶ Floating-point addition of
src1 and src2 for when _floating_point_add_same_wl or _floating_point_add_diff_wl can't be used. Works in all cases, but is the slowest.
-
template<typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT, typename QNTZ_FUNC_SIGNATURE>
static inline void _floating_point_mul_short(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz, QNTZ_FUNC_SIGNATURE qntz_func, const unsigned SUM_MAN_BITS, const exp_t SRC1_MAX_EXP, const exp_t SRC2_MAX_EXP, const exp_t RES_MAX_EXP, const man_t TWO, const man_t TWO_BEFORE, const man_t ONE_BEFORE, const man_t TWO_RES, const int MAN_DELTA, const man_t STICKY)¶ Floating-point multiplication of
src1 and src2 for when the mantissa product fits into a single std::uint64_t.
-
template<typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
static inline void _floating_point_mul_general(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz, const exp_t SRC1_MAX_EXP, const exp_t SRC2_MAX_EXP, const exp_t RES_MAX_EXP)¶ Floating-point multiplication of
src1 and src2. This is the most general low-level multiplication function, to be used only when the others won't work.
-
template<const bool SUB = false, const std::size_t SRC1_INC = 1, const std::size_t SRC2_INC = 1, const std::size_t DST_INC = 1, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
static inline void floating_point_sums(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const std::size_t n_elements, const QuantizationMode &qntz)¶ Iterator-based floating-point addition.
-
template<const std::size_t SRC1_INC = 1, const std::size_t SRC2_INC = 1, const std::size_t DST_INC = 1, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
static inline void floating_point_products(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const std::size_t n_elements, const QuantizationMode &qntz)¶ Iterator-based floating-point products.
-
template<const std::size_t SRC1_INC = 1, const std::size_t SRC2_INC = 1, const std::size_t DST_INC = 1, typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
static inline void floating_point_quotients(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const std::size_t n_elements, const QuantizationMode &qntz)¶ Iterator-based floating-point quotients.
-
static inline void floating_point_product(const APyFloatData &src1, const APyFloatData &src2, APyFloatData &dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz)¶
Perform a single float product.
-
template<const bool SUB = false>
static inline void floating_point_sum(const APyFloatData &src1, const APyFloatData &src2, APyFloatData &dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz)¶ Perform a single float sum.
-
template<const bool SUB = false>
static inline void floating_point_quotient(const APyFloatData &src1, const APyFloatData &src2, APyFloatData &dst, const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz)¶ Perform a single float quotient.
Variables
-
static constexpr std::size_t _MAN_T_SIZE_BITS = 8 * _MAN_T_SIZE_BYTES¶
-
static constexpr std::size_t _EXP_T_SIZE_BYTES = sizeof(exp_t)¶
-
static constexpr std::size_t _EXP_T_SIZE_BITS = 8 * _EXP_T_SIZE_BYTES¶
-
static constexpr std::size_t _MAN_LIMIT_BITS = _MAN_T_SIZE_BITS - 3¶
APyFloat word length limits.
-
static constexpr std::size_t _EXP_LIMIT_BITS = _EXP_T_SIZE_BITS - 2¶
-
template<typename RANDOM_ACCESS_ITERATOR_IN1, typename RANDOM_ACCESS_ITERATOR_IN2, typename RANDOM_ACCESS_ITERATOR_INOUT>
struct FloatInnerProdFunctor¶ Public Functions
-
inline FloatInnerProdFunctor(const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz)¶
-
inline auto get_inner_ptr(const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz) const¶
-
DEFINE_NEW_MUL(MUL_SHORT, _floating_point_mul_short)¶
-
DEFINE_NEW_MUL(MUL_GENERAL, _floating_point_mul_general)¶
-
DEFINE_NEW_ADD(ADD_SAME_WL, _floating_point_add_same_wl)¶
-
DEFINE_NEW_ADD(ADD_GENERAL, _floating_point_add_general)¶
-
template<auto MUL_FUNC, auto ADD_FUNC>
inline void inner_product(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, std::size_t N, std::size_t M = 1, std::size_t DST_STEP = 1) const¶
-
inline void operator()(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, std::size_t N, std::size_t M = 1, std::size_t DST_STEP = 1) const¶
Public Members
-
const APyFloatSpec _src1_spec¶
-
const APyFloatSpec _src2_spec¶
-
const APyFloatSpec _dst_spec¶
-
const QuantizationMode _qntz¶
-
const std::optional<APyFloatAccumulatorOption> _accumulator_mode = std::nullopt¶
-
void (FloatInnerProdFunctor::*const inner_f_ptr)(RANDOM_ACCESS_ITERATOR_IN1 src1, RANDOM_ACCESS_ITERATOR_IN2 src2, RANDOM_ACCESS_ITERATOR_INOUT dst, std::size_t N, std::size_t M, std::size_t RES_STEP) const¶
-
const unsigned SUM_MAN_BITS = _src1_spec.man_bits + _src2_spec.man_bits¶
-
const exp_t SRC1_MAX_EXP = ((1ULL << _src1_spec.exp_bits) - 1)¶
-
const exp_t SRC2_MAX_EXP = ((1ULL << _src2_spec.exp_bits) - 1)¶
-
const man_t TWO = 1ULL << (SUM_MAN_BITS + 2)¶
-
const man_t TWO_BEFORE = 1ULL << (SUM_MAN_BITS + 1)¶
-
const man_t ONE_BEFORE = 1ULL << (SUM_MAN_BITS + 0)¶
-
const int MAN_DELTA = int(SUM_MAN_BITS) + 2 - _dst_spec.man_bits¶
-
const man_t RES_LO = FINAL_RES_LO << 3¶
-
const man_t MAN_MASK = CARRY_RES_LO - 1¶
-
const unsigned NORM_CONST = _MAN_T_SIZE_BITS - _dst_spec.man_bits - 4¶
-
const decltype(get_qntz_func(_qntz, MAN_DELTA < 0)) _qntz_func = get_qntz_func(_qntz, MAN_DELTA < 0)¶
-
inline FloatInnerProdFunctor(const APyFloatSpec &src1_spec, const APyFloatSpec &src2_spec, const APyFloatSpec &dst_spec, const QuantizationMode &qntz)¶