Source code for afe.tvm_converter.quantization

#########################################################
# Copyright (C) 2022 SiMa Technologies, Inc.
#
# This material is SiMa proprietary and confidential.
#
# This material may not be copied or distributed without
# the express prior written permission of SiMa.
#
# All rights reserved.
#########################################################
# Code owner: Christopher Rodrigues
#########################################################
"""
Quantization code that is specific to the TVM converter.
"""
import math
from typing import Optional, Tuple, Union, List, Iterable, TypeVar

import numpy as np

from afe.ir.defines import Quantization
from afe.ir.quantization_utils import float_requantization

# Number of fractional bits in the fixed-point scale correction factor when using TFLite quantization.
_TFLITE_SC_BITS = 6


def correction_factors(input_q: Quantization, output_q: Quantization) -> Tuple[float, float, int]:
    """
    Determine correction factors for requantizing from input_q to output_q.
    The correction factors consist of a scale correction sc, zero point correction zc,
    and shift n such that

        output = (input * sc + zc) * 2**-n

    and sc is in the range 0.5 to 1.

    :param input_q: Quantization of data prior to requantization
    :param output_q: Quantization of data after requantization
    :return: Scale correction, zero point correction, and shift
    """
    sc_correction, zp_correction = float_requantization(input_q, output_q)

    # Decompose sc_correction = sc_normalized / 2**sc_exponent.
    # 2**sc_exponent must stay in 32-bit range.
    sc_exponent: int = -math.ceil(math.log2(sc_correction))
    assert 0 <= sc_exponent <= 32, "Insufficient precision for quantized calculation"
    if sc_exponent == 32:
        # The original assertion was: assert 0 <= sc_exponent < 32.
        # This is a unique case, made to run tflite_quant_efficientnet_s in AFE.
        # We can't use a shift value of 32. Instead, sc_normalized and shift can be
        # redistributed so that shift is 31. The effect will be that sc_normalized
        # is smaller, so weight * sc_normalized is smaller (it won't cover the full
        # int8 range of -128 to 127). SWMLA-1540.
        sc_exponent = 31
    sc_normalized = sc_correction * (2 ** sc_exponent)
    zc_normalized = zp_correction * (2 ** sc_exponent)
    return sc_normalized, zc_normalized, sc_exponent
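# Worked example (illustrative only; the correction values below are hypothetical,
# not taken from a real model). If float_requantization were to return
# sc_correction = 0.15 and zp_correction = 10.0, then
#
#     sc_exponent   = -ceil(log2(0.15)) = 2
#     sc_normalized = 0.15 * 2**2 = 0.6      (within the range 0.5 to 1)
#     zc_normalized = 10.0 * 2**2 = 40.0
#
# so correction_factors returns (0.6, 40.0, 2), and requantization computes
# output = (input * 0.6 + 40.0) * 2**-2.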
_A = TypeVar("_A")


def _all_equal(sequence: Iterable[_A]) -> bool:
    """
    Return True if all items in the input are equal.
    """
    i = iter(sequence)
    try:
        first = next(i)
    except StopIteration:
        # Empty sequence
        return True
    return all(x == first for x in i)


def _reshape_for_broadcast(tensor: np.ndarray, broadcast_shape: Tuple[int, ...], axis: int) -> np.ndarray:
    """
    Reshape the 1D tensor so it can be broadcast to broadcast_shape along axis.
    The tensor is reshaped to have size 1 in all other axes.

    _reshape_for_broadcast([0.5, 0.6, 0.7, 0.8], (4, 4, 4, 4), 1).shape == (1, 4, 1, 1)
    _reshape_for_broadcast([0.5, 0.6, 0.7, 0.8], (4, 4, 4, 4), 2).shape == (1, 1, 4, 1)

    :param tensor: Tensor to reshape
    :param broadcast_shape: Shape that it should be broadcast to
    :param axis: Axis to broadcast along
    :return: Reshaped tensor
    """
    assert len(tensor.shape) == 1
    n = tensor.shape[0]
    assert n == broadcast_shape[axis]
    shape = [1] * len(broadcast_shape)
    shape[axis] = n
    return np.reshape(tensor, tuple(shape))


def _apply_input_zp_correction(weight: np.ndarray, data_zero_point: int,
                               bias: Optional[np.ndarray], is_dense: bool) -> Optional[np.ndarray]:
    """
    Calculate the zero point correction for convolution input and add it to the bias
    for the convolution.

    In a quantized convolution, the zero point correction should be removed.
    We could subtract the zero point, then convolve and add the bias, as in
    (A - zp) * W + B. However, it is more efficient to convolve the quantized data
    first and then subtract the zero point together with the bias, (A * W) + (B - zp * W).
    This function computes (B - zp * W).

    :param weight: Quantized weight array for the convolution or dense operator.
    :param data_zero_point: Zero point of the operator's input data.
    :param bias: Quantized bias array for the convolution or dense operator.
    :param is_dense: Whether this is a dense operator. If false, this is a convolution operator.
    :return: If a bias array was given or a zero point correction was used, the bias array
        including the zero point correction. Otherwise, None.
    """
    assert len(weight.shape) == (2 if is_dense else 4)
    if data_zero_point == 0:
        return bias

    # Axis of the weight array that corresponds to the bias array's axis
    bias_axis = 0 if is_dense else 3
    sum_axes = tuple(x for x in range(len(weight.shape)) if x != bias_axis)
    bias_zp_correction = weight.sum(axis=sum_axes, dtype=np.int32) * -data_zero_point
    return bias_zp_correction if bias is None else bias + bias_zp_correction


# Requantization for convolution and dense matrix multiply
#
# A SiMa IR convolution or dense matrix multiply consists of fused convolution,
# bias-add, and requantization. These fused operations are algebraically simplified
# compared to the individual operations. In QNN, these are three separate operations.
# We cannot execute them as is; we have to reassociate some arithmetic so that it
# matches our implementation.
#
# Convolution and dense behave essentially the same way, the difference being in
# which dimensions are summed. Convolution is described below.
#
# In QNN, convolution calculates an integer convolution with zero point correction.
# We require the weights to be symmetrically quantized. A bias is added to the
# product. The result is requantized. The arithmetic for these 3 convolution steps
# in QNN is:
#
#   P = Sum_crs (W_kcrs * A_kcrs) - Sum_crs (W_kcrs) * z_a
#   Q = P + B
#   R = round(Q * sc + zc)
#
# In SiMa IR, the factors of sc and zc are folded into the other arithmetic.
# Only A_kcrs is a run-time input, so the factors can be precalculated.
# Multiplicative factors get folded into W and additive factors get folded into B.
# Disregarding quantization, we would calculate new weights W' and B' and
# perform arithmetic for SiMa's fused version of the operator:
#
#   W' = W * sc
#   B' = B * sc - Sum_crs (W_kcrs) * z_a * sc + zc
#   P' = Sum_crs (W'_kcrs * A_kcrs)
#   Q' = P' + B'
#   R' = round(Q')
#
# We have to requantize these values so that W' stays within the 8-bit representable
# range. (B' comprises 32-bit values, so it is not a concern.) We decompose
# sc = sc' * 2**-n where 0.5 <= sc' <= 1 + epsilon, scale intermediate values by sc',
# and shift the final result to apply the factor of 2**-n.
#
#   sc' = sc * 2**n
#   zc' = zc * 2**n
#   W' = round(W * sc')
#   B' = round(B * sc' - Sum_crs (W_kcrs) * z_a * sc' + zc')
#   P' = Sum_crs (W'_kcrs * A_kcrs)
#   Q' = P' + B'
#   R' = round(Q' * 2**-n)
#
# The values that we calculate and store are W', B', and n.
def requantize_qnn_convolution_dense(
        weight: np.ndarray, bias: Optional[np.ndarray], data_zero_point: int,
        product_q: Union[Quantization, List[Quantization]], output_q: Quantization,
        is_dense: bool
) -> Tuple[np.ndarray, np.ndarray, Union[int, np.ndarray]]:
    """
    Convert constant parameters from a Relay IR quantized convolution/dense, bias-add,
    and requantization to constant parameters for a SiMa IR convolution/dense.
    The SiMa IR operator is equivalent to these 3 operators. Some precision will be
    lost due to rounding when converting between these parameters.

    :param weight: Weight tensor from QNN convolution, in HWIGO layout, or from QNN dense, in OI layout.
    :param bias: Bias tensor from QNN convolution/dense. If None is given, it is
        treated as an array of zeros.
    :param data_zero_point: Zero point of the convolution's input activation matrix.
    :param product_q: Quantization of the input of the Relay IR requantize operator.
        When using per-tensor quantization, it is a single Quantization. When using
        per-channel quantization, it is a list of Quantization with one item per channel.
    :param output_q: Quantization of the output of the Relay IR requantize operator.
        This is the same as the quantization of the output of the SiMa IR operator.
    :param is_dense: If True, this function is used for requantization of a dense
        operator, otherwise of a convolution operator.
    :return: Weight, bias, and shift for SiMa IR convolution/dense.
    """
    assert weight.dtype in (np.int8, np.uint8)
    assert bias is None or bias.dtype == np.int32

    original_weight_shape = weight.shape
    if not is_dense:
        # Reshape weight tensor to HWIO so that it follows TVM's convention
        weight = weight.reshape(original_weight_shape[:-2] + (-1,))

    # Axis of the weight array that corresponds to the bias array's axis
    bias_axis = 0 if is_dense else 3
    channel_count = weight.shape[bias_axis]

    # Add the input zero point correction to the bias
    bias_zp_corrected = _apply_input_zp_correction(weight, data_zero_point, bias, is_dense)
    if bias_zp_corrected is None:
        # Even if no bias was given, initialize a bias array so that scale and zp
        # correction can be applied to it
        bias_zp_corrected = np.full((channel_count,), 0)

    shift: Union[int, np.ndarray]  # Right shift value for SiMa convolution/dense
    if isinstance(product_q, Quantization):
        # Get requantization factor for the entire tensor
        sc_normalized_scalar, zp_normalized_scalar, shift_scalar = correction_factors(product_q, output_q)

        # Expand sc and zp to per-channel arrays
        sc_normalized = np.full(shape=channel_count, fill_value=sc_normalized_scalar, dtype=np.float32)
        zp_normalized = np.full(shape=channel_count, fill_value=zp_normalized_scalar, dtype=np.float32)
        shift = shift_scalar
    else:
        # Get requantization factors for each channel
        assert len(product_q) == channel_count
        assert _all_equal(pq.zero_point for pq in product_q)
        factors: List[Tuple[float, float, int]] = [correction_factors(p, output_q) for p in product_q]

        # Convert list of tuples to arrays
        sc_normalized = np.array([f[0] for f in factors], dtype=np.float32)
        zp_normalized = np.array([f[1] for f in factors], dtype=np.float32)
        shift = np.array([f[2] for f in factors], dtype=int)

    # Fold requantization factors into weight and bias
    sc_normalized_wshape = _reshape_for_broadcast(sc_normalized, weight.shape, bias_axis)
    weight_out = np.round(weight.astype(np.float32) * sc_normalized_wshape).astype(weight.dtype)
    bias_out = np.round(bias_zp_corrected * sc_normalized + zp_normalized).astype(np.int32)

    if not is_dense:
        # Restore weight tensor to the same layout as the original weight parameter
        weight_out = weight_out.reshape(original_weight_shape)

    return weight_out, bias_out, shift
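# A minimal usage sketch (illustrative; weight, bias, and the Quantization values
# are placeholders, not prescribed by this module). For a per-tensor quantized
# convolution with an int8 HWIGO weight tensor and an int32 bias:
#
#     weight_out, bias_out, shift = requantize_qnn_convolution_dense(
#         weight, bias, data_zero_point=-128,
#         product_q=product_quantization, output_q=output_quantization,
#         is_dense=False)
#
# weight_out keeps the dtype and layout of weight, bias_out is int32 with the zero
# point correction folded in, and shift is the right shift n that the SiMa IR
# operator applies after the integer convolution and bias-add.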
def _quantize_per_channel_tflite_scale(sc: np.ndarray, shift: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Quantize scale factor and shift for per-channel TFLite requantization.

    TFLite requantization approximates the multiplication (a * sc), where sc is
    between 0.25 and 1, using integer arithmetic (a * sc') >> shift. By default,
    sc' is quantized with _TFLITE_SC_BITS bits of precision. However, if the shift
    would be more than 30 bits, the number of bits is reduced to avoid precision loss.

    :param sc: Scale correction factors. An array of floating-point values in the range 0.25 to 1.
    :param shift: Shift factors. An array of integers in the range 0 to 31.
    :return: Quantized scale correction and shift. The quantized scale correction has int8 type.
    """
    assert not np.any(shift >= 32), "Shift is out of range"
    assert np.all(sc <= 1)
    assert np.all(sc >= 0.25)

    adjusted_shift = shift + _TFLITE_SC_BITS
    if np.all(adjusted_shift <= 30):
        # All values are in the acceptable range. Use the full number of bits.
        sc_multiplier = 1 << _TFLITE_SC_BITS
    else:
        # Some shift factors would be greater than 30 bits.
        # Reduce the shift to retain 2 bits of precision in the output.
        excess_shift = np.maximum(adjusted_shift - 30, 0)
        adjusted_shift -= excess_shift
        assert np.all(adjusted_shift >= 0)
        sc_shift = _TFLITE_SC_BITS - excess_shift

        # At least 2 bits of precision are needed to ensure that the quantized scale
        # is nonzero. If it has less than 2 bits of precision, set it to zero.
        adjusted_shift = np.where(sc_shift < 2, 0, adjusted_shift)
        sc_multiplier = np.where(sc_shift < 2, 0, np.left_shift(1, sc_shift))

    sc = np.round(sc * sc_multiplier).astype(np.int8)
    shift = adjusted_shift.astype(np.int8)
    return sc, shift
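# Worked example (illustrative): with _TFLITE_SC_BITS = 6, a scale correction
# sc = 0.75 with shift = 4 is quantized as
#
#     sc'   = round(0.75 * 2**6) = 48
#     shift = 4 + 6 = 10
#
# so (a * 0.75) >> 4 is approximated by (a * 48) >> 10. If instead shift = 28,
# then 28 + 6 = 34 exceeds 30, so 4 bits of precision are dropped:
# sc' = round(0.75 * 2**2) = 3 and the shift becomes 30, giving (a * 3) >> 30.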
def tflite_requantization_constants(
        weight: np.ndarray, bias: Optional[np.ndarray], data_zero_point: int,
        input_q: Union[Quantization, List[Quantization]], output_q: Quantization,
        is_dense: bool
) -> Union[Tuple[Optional[np.ndarray], int, int, int],
           Tuple[Optional[np.ndarray], np.ndarray, int, np.ndarray]]:
    """
    Compute constants for TFLite-style requantization.

    :param weight: Weight tensor from QNN convolution, in HWIGO layout, or from QNN dense, in OI layout.
    :param bias: Bias tensor from QNN convolution/dense. If None is given, it is
        treated as an array of zeros.
    :param data_zero_point: Zero point of the convolution's input activation matrix.
    :param input_q: Quantization of the input of the Relay IR requantize operator.
        When using per-tensor quantization, it is a single Quantization. When using
        per-channel quantization, it is a list of Quantization with one item per channel.
    :param output_q: Quantization of the output of the Relay IR requantize operator.
        This is the same as the quantization of the output of the SiMa IR operator.
    :param is_dense: If True, this function is used for requantization of a dense
        operator, otherwise of a convolution operator.
    :return: Modified bias, scale correction, zero point correction, and shift for
        convolution. Scale correction and shift are integers for per-tensor
        convolution, or arrays for per-channel convolution.
    """
    assert not is_dense, "TFLite quantization is not implemented for dense matrix multiply"

    # Reshape weight tensor to HWIO so that it follows TVM's convention
    weight = weight.reshape(weight.shape[:-2] + (-1,))

    # Add the input zero point correction to the bias
    bias_zp_corrected = _apply_input_zp_correction(weight, data_zero_point, bias, is_dense)

    int32_iinfo = np.iinfo(np.int32)
    if isinstance(input_q, Quantization):
        # Per-tensor quantization
        sc_normalized_scalar, zp_normalized_scalar, shift_scalar = correction_factors(input_q, output_q)
        zp_normalized_scalar *= 2**(-shift_scalar)  # Undo the shift from correction_factors
        zp = round(np.clip(zp_normalized_scalar, int32_iinfo.min, int32_iinfo.max).item())

        # Convert sc from the float range (0.25, 1) to the fixed-point range [16, 64]
        sc = round(sc_normalized_scalar * (1 << _TFLITE_SC_BITS))
        shift_scalar += _TFLITE_SC_BITS
        assert shift_scalar < 32, "Shift is out of range"
        shift = shift_scalar
    else:
        # Per-channel quantization
        assert len(input_q) > 0
        factors: List[Tuple[float, float, int]] = [correction_factors(p, output_q) for p in input_q]

        # Convert list of tuples to arrays
        sc = np.array([f[0] for f in factors], dtype=np.float32)
        zp = np.array([f[1] for f in factors], dtype=np.float32)
        shift = np.array([f[2] for f in factors], dtype=int)
        zp *= 2.0**(-shift)  # Undo the shift from correction_factors

        # All input scales in input_q were calculated using the same zero point,
        # so the zero point corrections will all be equal after they are converted
        # to the same scale.
        zp = zp.clip(int32_iinfo.min, int32_iinfo.max).round().astype(np.int32)
        assert np.all(zp == zp[0])
        zp = int(zp[0])

        # Quantize the scale corrections
        sc, shift = _quantize_per_channel_tflite_scale(sc, shift)

    if bias_zp_corrected is not None:
        bias_out = np.round(bias_zp_corrected).clip(int32_iinfo.min, int32_iinfo.max).astype(np.int32)
    else:
        bias_out = None
    return bias_out, sc, zp, shift
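# A minimal usage sketch (illustrative; the arguments are placeholders). For a
# per-channel quantized convolution:
#
#     bias_out, sc, zp, shift = tflite_requantization_constants(
#         weight, bias, data_zero_point=0,
#         input_q=per_channel_quantizations,  # list of Quantization, one per channel
#         output_q=output_quantization, is_dense=False)
#
# Here sc and shift are int8 arrays produced by _quantize_per_channel_tflite_scale,
# zp is a single int in the output scale, and bias_out is the int32 bias with the
# input zero point correction folded in. The intended requantization arithmetic is
# approximately ((x * sc) >> shift) + zp per output channel.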