Source code for afe.tvm_converter.quantization

#########################################################
# Copyright (C) 2022 SiMa Technologies, Inc.
#
# This material is SiMa proprietary and confidential.
#
# This material may not be copied or distributed without
# the express prior written permission of SiMa.
#
# All rights reserved.
#########################################################
# Code owner: Christopher Rodrigues
#########################################################
"""
Quantization code that is specific to the TVM converter.
"""
import math
from typing import Optional, Tuple, Union, List, Iterable, TypeVar

import numpy as np

from afe.ir.defines import Quantization
from afe.ir.quantization_utils import float_requantization

# Number of fractional bits in the fixed-point scale correction factor when using TFLite quantization.
_TFLITE_SC_BITS = 6


def correction_factors(input_q: Quantization, output_q: Quantization) -> Tuple[float, float, int]:
    """
    Determine correction factors for requantizing from input_q to output_q.
    The correction factors consist of a scale correction sc, zero point correction zc,
    and shift n such that

        output = (input * sc + zc) * 2**-n

    and sc is in the range 0.5 to 1.

    :param input_q: Quantization of data prior to requantization
    :param output_q: Quantization of data after requantization
    :return: Scale correction, zero point correction, and shift
    """
    sc_correction, zp_correction = float_requantization(input_q, output_q)

    # Decompose sc_correction = sc_normalized / 2**sc_exponent.
    # 2**sc_exponent must stay in 32-bit range.
    sc_exponent: int = -math.ceil(math.log2(sc_correction))
    assert 0 <= sc_exponent <= 32, "Insufficient precision for quantized calculation"
    if sc_exponent == 32:
        # The original assertion was: assert 0 <= sc_exponent < 32.
        # This is a unique case, made to run tflite_quant_efficientnet_s in AFE.
        # We can't use a shift value of 32. Instead, sc_normalized and shift can be
        # redistributed so that shift is 31. The effect will be that sc_normalized
        # is smaller, so weight * sc_normalized is smaller (it won't cover the full
        # int8 range of -128 to 127). SWMLA-1540.
        sc_exponent = 31
    sc_normalized = sc_correction * (2 ** sc_exponent)
    zc_normalized = zp_correction * (2 ** sc_exponent)
    return sc_normalized, zc_normalized, sc_exponent
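# Worked example (illustrative only; the correction values below are hypothetical,
# not taken from a real model). If float_requantization were to return
# sc_correction = 0.15 and zp_correction = 10.0, then
#
#     sc_exponent   = -ceil(log2(0.15)) = 2
#     sc_normalized = 0.15 * 2**2 = 0.6      (within the range 0.5 to 1)
#     zc_normalized = 10.0 * 2**2 = 40.0
#
# so correction_factors returns (0.6, 40.0, 2), and requantization computes
# output = (input * 0.6 + 40.0) * 2**-2.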
_A = TypeVar("_A")


def _all_equal(sequence: Iterable[_A]) -> bool:
    """
    Return True if all items in the input are equal.
    """
    i = iter(sequence)
    try:
        first = next(i)
    except StopIteration:
        # Empty sequence
        return True
    return all(x == first for x in i)


def _reshape_for_broadcast(tensor: np.ndarray, broadcast_shape: Tuple[int, ...], axis: int) -> np.ndarray:
    """
    Reshape the 1D tensor so it can be broadcast to broadcast_shape along axis.
    The tensor is reshaped to have size 1 in all other axes.

    _reshape_for_broadcast([0.5, 0.6, 0.7, 0.8], (4, 4, 4, 4), 1).shape == (1, 4, 1, 1)
    _reshape_for_broadcast([0.5, 0.6, 0.7, 0.8], (4, 4, 4, 4), 2).shape == (1, 1, 4, 1)

    :param tensor: Tensor to reshape
    :param broadcast_shape: Shape that it should be broadcast to
    :param axis: Axis to broadcast along
    :return: Reshaped tensor
    """
    assert len(tensor.shape) == 1
    n = tensor.shape[0]
    assert n == broadcast_shape[axis]
    shape = [1] * len(broadcast_shape)
    shape[axis] = n
    return np.reshape(tensor, tuple(shape))


def _apply_input_zp_correction(weight: np.ndarray, data_zero_point: int,
                               bias: Optional[np.ndarray], is_dense: bool) -> Optional[np.ndarray]:
    """
    Calculate the zero point correction for convolution input and add it to the bias
    for the convolution.

    In a quantized convolution, the zero point correction should be removed.
    We could subtract the zero point, then convolve and add the bias, as in
    (A - zp) * W + B. However, it is more efficient to convolve the quantized data
    first and then subtract the zero point together with the bias, (A * W) + (B - zp * W).
    This function computes (B - zp * W).

    :param weight: Quantized weight array for the convolution or dense operator.
    :param data_zero_point: Zero point of the operator's input data.
    :param bias: Quantized bias array for the convolution or dense operator.
    :param is_dense: Whether this is a dense operator. If false, this is a convolution operator.
    :return: If a bias array was given or a zero point correction was used, the bias array
        including the zero point correction. Otherwise, None.
    """
    assert len(weight.shape) == (2 if is_dense else 4)
    if data_zero_point == 0:
        return bias

    # Axis of the weight array that corresponds to the bias array's axis
    bias_axis = 0 if is_dense else 3
    sum_axes = tuple(x for x in range(len(weight.shape)) if x != bias_axis)
    bias_zp_correction = weight.sum(axis=sum_axes, dtype=np.int32) * -data_zero_point
    return bias_zp_correction if bias is None else bias + bias_zp_correction


# Requantization for convolution and dense matrix multiply
#
# A SiMa IR convolution or dense matrix multiply consists of fused convolution,
# bias-add, and requantization. These fused operations are algebraically simplified
# compared to the individual operations. In QNN, these are three separate operations.
# We cannot execute them as is; we have to reassociate some arithmetic so that it
# matches our implementation.
#
# Convolution and dense behave essentially the same way, the difference being in
# which dimensions are summed. Convolution is described below.
#
# In QNN, convolution calculates an integer convolution with zero point correction.
# We require the weights to be symmetrically quantized. A bias is added to the
# product. The result is requantized. The arithmetic for these 3 convolution steps
# in QNN is:
#
#   P = Sum_crs (W_kcrs * A_kcrs) - Sum_crs (W_kcrs) * z_a
#   Q = P + B
#   R = round(Q * sc + zc)
#
# In SiMa IR, the factors of sc and zc are folded into the other arithmetic.
# Only A_kcrs is a run-time input, so the factors can be precalculated.
# Multiplicative factors get folded into W and additive factors get folded into B.
# Disregarding quantization, we would calculate new weights W' and B' and
# perform arithmetic for SiMa's fused version of the operator:
#
#   W' = W * sc
#   B' = B * sc - Sum_crs (W_kcrs) * z_a * sc + zc
#   P' = Sum_crs (W'_kcrs * A_kcrs)
#   Q' = P' + B'
#   R' = round(Q')
#
# We have to requantize these values so that W' stays within the 8-bit representable
# range. (B' comprises 32-bit values, so it is not a concern.) We decompose
# sc = sc' * 2**-n where 0.5 <= sc' <= 1 + epsilon, scale intermediate values by sc',
# and shift the final result to apply the factor of 2**-n.
#
#   sc' = sc * 2**n
#   zc' = zc * 2**n
#   W' = round(W * sc')
#   B' = round(B * sc' - Sum_crs (W_kcrs) * z_a * sc' + zc')
#   P' = Sum_crs (W'_kcrs * A_kcrs)
#   Q' = P' + B'
#   R' = round(Q' * 2**-n)
#
# The values that we calculate and store are W', B', and n.
def requantize_qnn_convolution_dense(
        weight: np.ndarray, bias: Optional[np.ndarray], data_zero_point: int,
        product_q: Union[Quantization, List[Quantization]], output_q: Quantization,
        is_dense: bool
) -> Tuple[np.ndarray, np.ndarray, Union[int, np.ndarray]]:
    """
    Convert constant parameters from a Relay IR quantized convolution/dense, bias-add,
    and requantization to constant parameters for a SiMa IR convolution/dense.
    The SiMa IR operator is equivalent to these 3 operators. Some precision will be
    lost due to rounding when converting between these parameters.

    :param weight: Weight tensor from QNN convolution, in HWIGO layout, or from QNN dense, in OI layout.
    :param bias: Bias tensor from QNN convolution/dense. If None is given, it is
        treated as an array of zeros.
    :param data_zero_point: Zero point of the convolution's input activation matrix.
    :param product_q: Quantization of the input of the Relay IR requantize operator.
        When using per-tensor quantization, it is a single Quantization. When using
        per-channel quantization, it is a list of Quantization with one item per channel.
    :param output_q: Quantization of the output of the Relay IR requantize operator.
        This is the same as the quantization of the output of the SiMa IR operator.
    :param is_dense: If True, this function is used for requantization of a dense
        operator, otherwise of a convolution operator.
    :return: Weight, bias, and shift for SiMa IR convolution/dense.
    """
    assert weight.dtype in (np.int8, np.uint8)
    assert bias is None or bias.dtype == np.int32

    original_weight_shape = weight.shape
    if not is_dense:
        # Reshape weight tensor to HWIO so that it follows TVM's convention
        weight = weight.reshape(original_weight_shape[:-2] + (-1,))

    # Axis of the weight array that corresponds to the bias array's axis
    bias_axis = 0 if is_dense else 3
    channel_count = weight.shape[bias_axis]

    # Add the input zero point correction to the bias
    bias_zp_corrected = _apply_input_zp_correction(weight, data_zero_point, bias, is_dense)
    if bias_zp_corrected is None:
        # Even if no bias was given, initialize a bias array so that scale and zp
        # correction can be applied to it
        bias_zp_corrected = np.full((channel_count,), 0)

    shift: Union[int, np.ndarray]  # Right shift value for SiMa convolution/dense
    if isinstance(product_q, Quantization):
        # Get requantization factor for the entire tensor
        sc_normalized_scalar, zp_normalized_scalar, shift_scalar = correction_factors(product_q, output_q)

        # Expand sc and zp to per-channel arrays
        sc_normalized = np.full(shape=channel_count, fill_value=sc_normalized_scalar, dtype=np.float32)
        zp_normalized = np.full(shape=channel_count, fill_value=zp_normalized_scalar, dtype=np.float32)
        shift = shift_scalar
    else:
        # Get requantization factors for each channel
        assert len(product_q) == channel_count
        assert _all_equal(pq.zero_point for pq in product_q)
        factors: List[Tuple[float, float, int]] = [correction_factors(p, output_q) for p in product_q]

        # Convert list of tuples to arrays
        sc_normalized = np.array([f[0] for f in factors], dtype=np.float32)
        zp_normalized = np.array([f[1] for f in factors], dtype=np.float32)
        shift = np.array([f[2] for f in factors], dtype=int)

    # Fold requantization factors into weight and bias
    sc_normalized_wshape = _reshape_for_broadcast(sc_normalized, weight.shape, bias_axis)
    weight_out = np.round(weight.astype(np.float32) * sc_normalized_wshape).astype(weight.dtype)
    bias_out = np.round(bias_zp_corrected * sc_normalized + zp_normalized).astype(np.int32)

    if not is_dense:
        # Restore weight tensor to the same layout as the original weight parameter
        weight_out = weight_out.reshape(original_weight_shape)

    return weight_out, bias_out, shift
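# A minimal usage sketch (illustrative; weight, bias, and the Quantization values
# are placeholders, not prescribed by this module). For a per-tensor quantized
# convolution with an int8 HWIGO weight tensor and an int32 bias:
#
#     weight_out, bias_out, shift = requantize_qnn_convolution_dense(
#         weight, bias, data_zero_point=-128,
#         product_q=product_quantization, output_q=output_quantization,
#         is_dense=False)
#
# weight_out keeps the dtype and layout of weight, bias_out is int32 with the zero
# point correction folded in, and shift is the right shift n that the SiMa IR
# operator applies after the integer convolution and bias-add.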
def _quantize_per_channel_tflite_scale(sc: np.ndarray, shift: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Quantize scale factor and shift for per-channel TFLite requantization.

    TFLite requantization approximates the multiplication (a * sc), where sc is
    between 0.25 and 1, using integer arithmetic (a * sc') >> shift. By default,
    sc' is quantized with _TFLITE_SC_BITS bits of precision. However, if the shift
    would be more than 30 bits, the number of bits is reduced to avoid precision loss.

    :param sc: Scale correction factors. An array of floating-point values in the range 0.25 to 1.
    :param shift: Shift factors. An array of integers in the range 0 to 31.
    :return: Quantized scale correction and shift. The quantized scale correction has int8 type.
    """
    assert not np.any(shift >= 32), "Shift is out of range"
    assert np.all(sc <= 1)
    assert np.all(sc >= 0.25)

    adjusted_shift = shift + _TFLITE_SC_BITS
    if np.all(adjusted_shift <= 30):
        # All values are in the acceptable range. Use the full number of bits.
        sc_multiplier = 1 << _TFLITE_SC_BITS
    else:
        # Some shift factors would be greater than 30 bits.
        # Reduce the shift to retain 2 bits of precision in the output.
        excess_shift = np.maximum(adjusted_shift - 30, 0)
        adjusted_shift -= excess_shift
        assert np.all(adjusted_shift >= 0)
        sc_shift = _TFLITE_SC_BITS - excess_shift

        # At least 2 bits of precision are needed to ensure that the quantized scale
        # is nonzero. If it has less than 2 bits of precision, set it to zero.
        adjusted_shift = np.where(sc_shift < 2, 0, adjusted_shift)
        sc_multiplier = np.where(sc_shift < 2, 0, np.left_shift(1, sc_shift))

    sc = np.round(sc * sc_multiplier).astype(np.int8)
    shift = adjusted_shift.astype(np.int8)
    return sc, shift
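# Worked example (illustrative): with _TFLITE_SC_BITS = 6, a scale correction
# sc = 0.75 with shift = 4 is quantized as
#
#     sc'   = round(0.75 * 2**6) = 48
#     shift = 4 + 6 = 10
#
# so (a * 0.75) >> 4 is approximated by (a * 48) >> 10. If instead shift = 28,
# then 28 + 6 = 34 exceeds 30, so 4 bits of precision are dropped:
# sc' = round(0.75 * 2**2) = 3 and the shift becomes 30, giving (a * 3) >> 30.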
def tflite_requantization_constants(
        weight: np.ndarray, bias: Optional[np.ndarray], data_zero_point: int,
        input_q: Union[Quantization, List[Quantization]], output_q: Quantization,
        is_dense: bool
) -> Union[Tuple[Optional[np.ndarray], int, int, int],
           Tuple[Optional[np.ndarray], np.ndarray, int, np.ndarray]]:
    """
    Compute constants for TFLite-style requantization.

    :param weight: Weight tensor from QNN convolution, in HWIGO layout, or from QNN dense, in OI layout.
    :param bias: Bias tensor from QNN convolution/dense. If None is given, it is
        treated as an array of zeros.
    :param data_zero_point: Zero point of the convolution's input activation matrix.
    :param input_q: Quantization of the input of the Relay IR requantize operator.
        When using per-tensor quantization, it is a single Quantization. When using
        per-channel quantization, it is a list of Quantization with one item per channel.
    :param output_q: Quantization of the output of the Relay IR requantize operator.
        This is the same as the quantization of the output of the SiMa IR operator.
    :param is_dense: If True, this function is used for requantization of a dense
        operator, otherwise of a convolution operator.
    :return: Modified bias, scale correction, zero point correction, and shift for
        convolution. Scale correction and shift are integers for per-tensor
        convolution, or arrays for per-channel convolution.
    """
    assert not is_dense, "TFLite quantization is not implemented for dense matrix multiply"

    # Reshape weight tensor to HWIO so that it follows TVM's convention
    weight = weight.reshape(weight.shape[:-2] + (-1,))

    # Add the input zero point correction to the bias
    bias_zp_corrected = _apply_input_zp_correction(weight, data_zero_point, bias, is_dense)

    int32_iinfo = np.iinfo(np.int32)
    if isinstance(input_q, Quantization):
        # Per-tensor quantization
        sc_normalized_scalar, zp_normalized_scalar, shift_scalar = correction_factors(input_q, output_q)
        zp_normalized_scalar *= 2**(-shift_scalar)  # Undo the shift from correction_factors
        zp = round(np.clip(zp_normalized_scalar, int32_iinfo.min, int32_iinfo.max).item())

        # Convert sc from the float range (0.25, 1) to the fixed-point range [16, 64]
        sc = round(sc_normalized_scalar * (1 << _TFLITE_SC_BITS))
        shift_scalar += _TFLITE_SC_BITS
        assert shift_scalar < 32, "Shift is out of range"
        shift = shift_scalar
    else:
        # Per-channel quantization
        assert len(input_q) > 0
        factors: List[Tuple[float, float, int]] = [correction_factors(p, output_q) for p in input_q]

        # Convert list of tuples to arrays
        sc = np.array([f[0] for f in factors], dtype=np.float32)
        zp = np.array([f[1] for f in factors], dtype=np.float32)
        shift = np.array([f[2] for f in factors], dtype=int)
        zp *= 2.0**(-shift)  # Undo the shift from correction_factors

        # All input scales in input_q were calculated using the same zero point,
        # so the zero point corrections will all be equal after they are converted
        # to the same scale.
        zp = zp.clip(int32_iinfo.min, int32_iinfo.max).round().astype(np.int32)
        assert np.all(zp == zp[0])
        zp = int(zp[0])

        # Quantize the scale corrections
        sc, shift = _quantize_per_channel_tflite_scale(sc, shift)

    if bias_zp_corrected is not None:
        bias_out = np.round(bias_zp_corrected).clip(int32_iinfo.min, int32_iinfo.max).astype(np.int32)
    else:
        bias_out = None
    return bias_out, sc, zp, shift
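# A minimal usage sketch (illustrative; the arguments are placeholders). For a
# per-channel quantized convolution:
#
#     bias_out, sc, zp, shift = tflite_requantization_constants(
#         weight, bias, data_zero_point=0,
#         input_q=per_channel_quantizations,  # list of Quantization, one per channel
#         output_q=output_quantization, is_dense=False)
#
# Here sc and shift are int8 arrays produced by _quantize_per_channel_tflite_scale,
# zp is a single int in the output scale, and bias_out is the int32 bias with the
# input zero point correction folded in. The intended requantization arithmetic is
# approximately ((x * sc) >> shift) + zp per output channel.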