#########################################################
# Copyright (C) 2022 SiMa Technologies, Inc.
#
# This material is SiMa proprietary and confidential.
#
# This material may not be copied or distributed without
# the express prior written permission of SiMa.
#
# All rights reserved.
#########################################################
# Code owner: Ljubomir Papuga
#########################################################
"""
This file contains definitions of the types exposed by the development API for AFE.
"""
import dataclasses
from dataclasses import dataclass
from enum import Enum, auto
import numpy as np
from typing import Dict, Set, List, Optional, Any, ContextManager
from afe.ir.defines import InputName, NodeName, BiasCorrectionType
#########
# Exporting
from afe.ir.tensor_type import scalar_type_from_dtype, scalar_type_to_dtype, TensorType, ScalarType
from afe.ir.defines import RequantizationMode
from afe.ir.utils import transpose_tensor_according_to_layout_strings
from afe.backends.mla.afe_to_n2a_compiler.defines import (
TensorDRAMLayout, TensorTessellateParameters, TessellateParameters
)
########
from afe.apis import error_handling_variables
from sima_utils.logging import sima_logger
from sima_utils.common import Platform
gen1_target = Platform.GEN1
gen2_target = Platform.GEN2
class ExceptionFuncType(Enum):
    LOADED_NET_LOAD = auto()
    LOADED_NET_EXECUTE = auto()
    LOADED_NET_QUANTIZE = auto()
    LOADED_NET_CONVERT = auto()
    MODEL_EXECUTE = auto()
    MODEL_SAVE = auto()
    MODEL_LOAD = auto()
    MODEL_COMPILE = auto()
    MODEL_CREATE_AUXILIARY = auto()
    MODEL_COMPOSE = auto()
    MODEL_EVALUATE = auto()
    MODEL_PERFORMANCE = auto()
    GENERATE_ELF_FILES = auto()
    QUANTIZATION_ERROR_ANALYSIS = auto()
def _message_print(custom_message, exc_message, exc_type):
    if exc_type == sima_logger.UserFacingException:
        sima_logger.sima_log_critical(exc_message)
    # Special handling for TypeErrors raised when callers pass keyword
    # arguments that were removed or renamed in our API.
    if exc_type == TypeError and "got an unexpected keyword" in str(exc_message):
        sima_logger.sima_log_critical(exc_message)
    print("#######################################")
    print(custom_message)
_EXCEPTION_SUMMARIES: Dict[ExceptionFuncType, str] = {
    ExceptionFuncType.LOADED_NET_LOAD: "Loading into TVM error.",
    ExceptionFuncType.LOADED_NET_EXECUTE: "Loaded net execution error.",
    ExceptionFuncType.LOADED_NET_QUANTIZE: "Quantization failed.",
    ExceptionFuncType.MODEL_EXECUTE: "Model execution failed.",
    ExceptionFuncType.MODEL_SAVE: "Saving model failed.",
    ExceptionFuncType.MODEL_LOAD: "Model load failed.",
    ExceptionFuncType.MODEL_COMPILE: "Compile model failed.",
    ExceptionFuncType.MODEL_CREATE_AUXILIARY: "Create auxiliary model failed.",
    ExceptionFuncType.MODEL_COMPOSE: "Model compose failed.",
    ExceptionFuncType.MODEL_EVALUATE: "Model evaluate failed.",
    ExceptionFuncType.MODEL_PERFORMANCE: "Getting model performance failed.",
    ExceptionFuncType.GENERATE_ELF_FILES: "Generation of files failed.",
    ExceptionFuncType.QUANTIZATION_ERROR_ANALYSIS: "Quantization error analysis failed.",
}


class _SimaExceptionManager(ContextManager):
    """
    Provides exception handling for gracefully exiting.
    """
    call_type: ExceptionFuncType

    def __init__(self, test_type: ExceptionFuncType):
        self.call_type = test_type

    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is not None:
            if error_handling_variables.ENABLE_VERBOSE_ERROR_MESSAGES:
                raise
            summary = _EXCEPTION_SUMMARIES.get(self.call_type, "")
            _message_print(summary, exc_val, exc_type)
            raise SystemExit()
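# The sketch below is illustrative only (not part of the API): it shows how an
# API entry point is expected to use _SimaExceptionManager. Unless
# error_handling_variables.ENABLE_VERBOSE_ERROR_MESSAGES is set, a failure
# inside the block is reported as a one-line summary and the process exits via
# SystemExit instead of printing a raw traceback.
def _example_wrapped_api_call() -> None:
    with _SimaExceptionManager(ExceptionFuncType.LOADED_NET_QUANTIZE):
        raise sima_logger.UserFacingException("Calibration data is empty.")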
class ColorSpaceStandard(str, Enum):
    """
    Color space standards for YUV and RGB conversion.
    BT601 is for SD video; BT709 is for HD video; BT2020 is for HDR.
    """
    # String values are assumed lowercase, following the convention of the
    # other string-valued enums in this module.
    BT601 = 'bt601'
    BT709 = 'bt709'
    BT2020 = 'bt2020'


# Color coefficients [Kr, Kg, Kb, 2*(1 - Kb), 2*(1 - Kr)] per standard.
BT_COLOR_COEFF: Dict[ColorSpaceStandard, List[float]] = {
    ColorSpaceStandard.BT601: [0.299, 0.587, 0.114, 1.772, 1.402],
    ColorSpaceStandard.BT709: [0.2126, 0.7152, 0.0722, 1.8556, 1.5748],
    ColorSpaceStandard.BT2020: [0.2627, 0.6780, 0.0593, 1.8814, 1.4747]
}
YUV2RGB_FULL_RANGE_CONSTANTS: Dict[str, List[float]] = {
'offset': [16, 128, 128],
'scale': [255/219, 255/224, 255/224]
}
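# The helper below is an illustrative sketch only (not part of the API): it
# shows how BT_COLOR_COEFF and YUV2RGB_FULL_RANGE_CONSTANTS combine in a
# textbook YUV -> RGB conversion. The coefficients are
# [Kr, Kg, Kb, 2*(1 - Kb), 2*(1 - Kr)]; the offset/scale constants expand
# studio-range YUV to full range before the conversion is applied.
def _example_yuv_to_rgb(y: float, u: float, v: float,
                        standard: ColorSpaceStandard = ColorSpaceStandard.BT601) -> tuple:
    kr, kg, kb, cb, cr = BT_COLOR_COEFF[standard]
    offset = YUV2RGB_FULL_RANGE_CONSTANTS['offset']
    scale = YUV2RGB_FULL_RANGE_CONSTANTS['scale']
    # Expand studio-range YUV to full range and center the chroma channels.
    y = (y - offset[0]) * scale[0]
    u = (u - offset[1]) * scale[1]
    v = (v - offset[2]) * scale[2]
    r = y + cr * v
    b = y + cb * u
    g = (y - kr * r - kb * b) / kg  # from Y = Kr*R + Kg*G + Kb*B
    return r, g, b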
class ColorConversion(str, Enum):
    """
    Color conversion direction.
    """


class ChromaSampling(str, Enum):
    """
    Chroma sub-sampling representation.
    """


class ResizeMethod(str, Enum):
    """
    Interpolation method used in resize transform.
    """


class ResizeDepositLocation(str, Enum):
    """
    Deposit location of resized image in padded frame.
    """
    BOTTOMRIGHT = 'bottomright'
@dataclass
class CalibrationMethod:
"""
Represents a calibration method for model quantization.
The ``CalibrationMethod`` class defines a base structure for various calibration techniques used during the quantization process. Each method is identified by a unique name and can be instantiated using the `from_str` method.
"""
_name: str
@property
def name(self):
return self._name
@staticmethod
def from_str(method: str):
"""
Creates a calibration method based on the provided method name.
Supported Methods:
- ``MIN_MAX`` / ``min_max``: Uses the minimum and maximum values of the dataset to determine the quantization range.
- ``MSE`` / ``mse``: Utilizes a histogram-based method that minimizes the mean squared error (MSE) between the original and quantized values. This method uses 2048 histogram bins for precise calibration.
- ``MOVING_AVERAGE`` / ``moving_average``: Computes the quantization range by maintaining a moving average of the observed min and max values during calibration.
- ``HISTOGRAM_ENTROPY`` / ``entropy``: Employs an entropy-based approach to find the optimal threshold for quantization by minimizing information loss. It uses 512 histogram bins.
- ``HISTOGRAM_PERCENTILE`` / ``percentile``: Sets the quantization range based on a specified percentile of the distribution (defaulting to the 99.9th percentile). This method helps to ignore outliers and uses 1024 histogram bins.
Args:
method (str): The name of the calibration method to use.
Returns:
CalibrationMethod: The corresponding calibration method configured with default parameters.
Raises:
UserFacingException: If an unsupported calibration method is specified.
"""
if method in ('MIN_MAX', 'min_max'):
return MinMaxMethod()
elif method in ('MSE', 'mse'):
num_bins = 2048
return HistogramMSEMethod(num_bins)
elif method in ('MOVING_AVERAGE', 'moving_average'):
return MovingAverageMinMaxMethod()
elif method in ('HISTOGRAM_ENTROPY', 'entropy'):
num_bins = 512
return HistogramEntropyMethod(num_bins)
elif method in ('HISTOGRAM_PERCENTILE', 'percentile'):
percentile_value = 99.9
num_bins = 1024
return HistogramPercentileMethod(percentile_value, num_bins)
else:
            raise sima_logger.UserFacingException(f"Unsupported calibration method: {method}")
@dataclass
class MinMaxMethod(CalibrationMethod):
def __init__(self):
self._name = 'min_max'
@dataclass
class HistogramMSEMethod(CalibrationMethod):
"""
Represents the histogram MSE calibration method for quantization.
The ``HistogramMSEMethod`` records the running histogram of tensor values during calibration. It searches for the optimal ``min`` and ``max`` values based
on the histogram distribution to minimize the mean squared error (MSE) between the quantized model and the floating-point model. These optimal values are then
used to compute the quantization parameters in the same way as the ``MIN_MAX`` method.
    By default, the number of bins used for histogram calculation is ``2048`` when instantiated via the ``from_str()`` method. To customize the number of bins,
    construct ``HistogramMSEMethod(num_bins)`` directly.
Parameters:
num_bins (int): The number of bins used for histogram calculation. Default is ``2048``.
Returns:
HistogramMSEMethod: The configured histogram MSE calibration method.
"""
def __init__(self, num_bins):
self.num_bins = num_bins
self._name = 'mse'
@dataclass
class MovingAverageMinMaxMethod(CalibrationMethod):
    """
    Represents the moving average min-max calibration method for quantization.

    The ``MovingAverageMinMaxMethod`` computes the quantization range by maintaining a moving average of the ``min`` and ``max`` values observed during
    calibration. The averaged values are then used to compute the quantization parameters in the same way as the ``MIN_MAX`` method.

    Attributes:
        _name (str): The internal name of the calibration method, set to ``'moving_average'``.
    """
    def __init__(self):
        self._name = 'moving_average'
@dataclass
class HistogramEntropyMethod(CalibrationMethod):
    """
    Represents the histogram entropy calibration method for quantization.

    The ``HistogramEntropyMethod`` records a running histogram of tensor values during calibration. It searches for the optimal ``min`` and ``max`` values based
    on the distribution of the histogram to minimize the Kullback-Leibler (KL) divergence (relative entropy) between the quantized model and the floating-point model.
    These optimal values are then used to compute the quantization parameters in the same way as the ``MIN_MAX`` method.

    By default, the number of bins used for histogram calculation is ``512`` when instantiated via the ``from_str()`` method. To customize the number of bins,
    construct ``HistogramEntropyMethod(num_bins)`` directly.

    Attributes:
        num_bins (int): The number of bins used for histogram calculation.
        _name (str): The internal name of the calibration method, set to ``'entropy'``.
    """
    def __init__(self, num_bins):
        self.num_bins = num_bins
        self._name = 'entropy'
@dataclass
class HistogramPercentileMethod(CalibrationMethod):
"""
Represents the histogram percentile calibration method for quantization.
The ``HistogramPercentileMethod`` records the running histogram of tensor values during calibration.
It determines the optimal ``min`` and ``max`` values based on the specified percentile of the histogram distribution.
These values are then used to compute the quantization parameters in the same way as the ``MIN_MAX`` method.
When instantiated using the ``from_str`` method, the default values are:
- ``percentile_value``: 99.9
- ``num_bins``: 1024
To use custom values for the percentile or the number of bins, this class should be instantiated directly.
Parameters:
        percentile_value (float): The selected percentile for determining the quantization range. Defaults to ``99.9``.
num_bins (int): The number of bins used for histogram calculation. Defaults to ``1024``.
"""
percentile_value: float
def __init__(self, percentile_value, num_bins):
self.percentile_value = percentile_value
self.num_bins = num_bins
self._name = 'percentile'
def default_calibration() -> CalibrationMethod:
return HistogramMSEMethod(2048)
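# Illustrative usage sketch (not part of the API): ``from_str()`` applies the
# documented defaults, while direct construction allows custom settings.
def _example_select_calibration() -> CalibrationMethod:
    default_mse = CalibrationMethod.from_str('mse')   # HistogramMSEMethod with 2048 bins
    custom = HistogramPercentileMethod(percentile_value=99.99, num_bins=4096)
    assert default_mse.name == 'mse' and custom.name == 'percentile'
    return custom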
@dataclass
class QuantizationScheme:
    """
    Quantization scheme.

    :param asymmetric: Whether to use asymmetric quantization.
    :param per_channel: Whether to use per-channel quantization.
    :param bits: Number of bits of precision to use in the quantized representation.
    :param bf16: Whether to use bfloat16. If True, then asymmetric, per_channel, and bits are ignored.
    """
    asymmetric: bool
    per_channel: bool
    bits: int = 8
    bf16: bool = False
def quantization_scheme(asymmetric: bool, per_channel: bool, bits: int = 8) -> QuantizationScheme:
"""
Constructs a quantization scheme, which determines the range of quantizations that a quantization algorithm may choose from.
Parameters:
asymmetric (bool): Required. Specifies whether to use asymmetric (versus symmetric) quantization.
per_channel (bool): Required. Specifies whether to use per-channel (versus per-tensor) quantization.
bits (int, optional): The number of bits of precision to use for the quantized representation of activations.
Must be either ``8`` (for int8) or ``16`` (for int16). Defaults to ``8``.
The quantization of weights is fixed as int8.
Returns:
QuantizationScheme: The defined quantization scheme configured with the specified parameters.
"""
return QuantizationScheme(asymmetric, per_channel, bits, bf16=False)
def bfloat16_scheme() -> QuantizationScheme:
"""
Constructs a bfloat16 quantization scheme.
It directs the compiler to use bfloat16 instead of integer quantization.
"""
return QuantizationScheme(asymmetric=False, per_channel=False, bf16=True)
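# Illustrative usage sketch (not part of the API): typical scheme choices.
# Activations use per-tensor quantization in int8 or int16; weights use
# symmetric, per-channel int8; bfloat16_scheme() bypasses integer quantization.
def _example_schemes() -> List[QuantizationScheme]:
    int8_activations = quantization_scheme(asymmetric=True, per_channel=False)
    int16_activations = quantization_scheme(asymmetric=True, per_channel=False, bits=16)
    int8_weights = quantization_scheme(asymmetric=False, per_channel=True)
    bf16 = bfloat16_scheme()  # asymmetric, per_channel, and bits are ignored
    return [int8_activations, int16_activations, int8_weights, bf16]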
@dataclass
class QuantizationParams:
    """
    Parameters controlling how to quantize a network.

    :param calibration_method: Calibration method.
    :param activation_quantization_scheme: Quantization scheme for activation tensors.
    :param weight_quantization_scheme: Quantization scheme for weight tensors.
    :param requantization_mode: A way of doing quantized arithmetic.
    :param node_names: Names of nodes to prevent from quantizing.
    :param custom_quantization_configs: Dictionary setting a node's custom quantization options.
    :param biascorr_type: Selection of bias correction: regular/iterative/none.
    :param channel_equalization: If True, channel equalization is enabled.
    :param smooth_quant: If True, SmoothQuant is enabled.
    """
    calibration_method: CalibrationMethod
    activation_quantization_scheme: QuantizationScheme
    weight_quantization_scheme: QuantizationScheme
    requantization_mode: RequantizationMode = RequantizationMode.sima
    node_names: Set[str] = dataclasses.field(default_factory=set)
    custom_quantization_configs: Optional[Dict[NodeName, Dict[str, Any]]] = None
    biascorr_type: BiasCorrectionType = BiasCorrectionType.NONE
    channel_equalization: bool = False
    smooth_quant: bool = False
def with_calibration(self, method: CalibrationMethod) -> "QuantizationParams":
"""
Sets the calibration method for activation tensors.
This method configures the calibration approach for activation tensors during quantization. An observer is inserted at each layer to collect statistics of
the output tensors. The derived min and max values from these statistics are used to compute the quantization parameters (scale and zero point) for each layer.
Parameters:
method (CalibrationMethod): Required. The calibration method to use. Supported methods include various approaches to determine the optimal quantization range based on tensor statistics.
Returns:
QuantizationParams: A new instance of quantization parameters with the updated calibration method.
"""
new_quant_params = dataclasses.replace(self, calibration_method=method)
return new_quant_params
def with_activation_quantization(self, scheme: QuantizationScheme) -> "QuantizationParams":
"""
Sets the quantization scheme for activation tensors.
For activations, per-channel quantization is not supported. With per-tensor quantization, the ``asymmetric`` flag in the scheme can be set to either ``True`` or ``False``
to define the quantization behavior.
Parameters:
scheme (QuantizationScheme): Required. The quantization scheme to be applied for the model activations.
Returns:
QuantizationParams: A new instance of quantization parameters with the updated activation quantization scheme.
"""
new_quant_params = dataclasses.replace(self, activation_quantization_scheme=scheme)
return new_quant_params
def with_weight_quantization(self, scheme: QuantizationScheme) -> "QuantizationParams":
"""
Sets the quantization scheme for weight tensors.
For weights, the asymmetric quantization scheme is not supported. With symmetric quantization, the ``per_channel`` flag can be set to ``True`` or ``False`` to define
the quantization behavior for weights.
Parameters:
scheme (QuantizationScheme): Required. The quantization scheme to be applied for the model weights.
Returns:
QuantizationParams: A new instance of quantization parameters using the chosen weight quantization scheme.
"""
new_quant_params = dataclasses.replace(self, weight_quantization_scheme=scheme)
return new_quant_params
    def with_requantization_mode(self, requantization_mode: RequantizationMode) -> "QuantizationParams":
"""
Sets the requantization mode for convolutions.
Two requantization modes are supported:
- ``RequantizationMode.sima``: Uses arithmetic optimized for fast performance on SiMa’s accelerator. This is the default mode.
- ``RequantizationMode.tflite``: Uses TFLite’s arithmetic with an 8-bit constant multiplier.
Parameters:
requantization_mode (RequantizationMode): Required. The requantization mode to be applied.
Returns:
QuantizationParams: A new instance of quantization parameters with the updated requantization mode.
"""
new_quant_params = dataclasses.replace(self, requantization_mode=requantization_mode)
return new_quant_params
def with_unquantized_nodes(self, node_names: Set[str]) -> "QuantizationParams":
"""
Selects nodes to prevent from quantizing.
Nodes with the specified names will be excluded from the quantization process. This replaces the set of node names selected by any previous call to ``with_unquantized_nodes``.
Note that node names can be sensitive to changes in optimization or quantization settings, as some nodes may be created or renamed by the compiler.
Parameters:
node_names (Set[str]): Required. A set of strings specifying the names of nodes that should not be quantized.
Returns:
QuantizationParams: A new instance of quantization parameters with the updated unquantized node configuration.
"""
new_quant_params = dataclasses.replace(self, node_names=node_names)
return new_quant_params
    def with_custom_quantization_configs(self, custom_quantization_configs: Dict[NodeName, Dict[str, Any]]) -> "QuantizationParams":
"""
Sets custom quantization options for specific nodes.
The ``custom_quantization_configs`` is a dictionary where each key is a node name, and the corresponding value is a dictionary defining custom quantization options. This method is
typically used in the following scenarios:
1. Enable the ``int32`` output of the last convolution node.
2. Enable mixed-precision quantization.
**Note:** Users must obtain the node names from the SiMa IR graph. To do this, perform int8 quantization of a model and inspect the `.sima.json` file using Netron to identify node names.
Parameters:
custom_quantization_configs (Dict[NodeName, Dict[str, Any]]): A dictionary where each key is a node name and the value is a dictionary of custom quantization settings for that node.
Returns:
QuantizationParams: A new instance of quantization parameters with the custom quantization configuration applied.
"""
new_quant_params = dataclasses.replace(self, custom_quantization_configs=custom_quantization_configs)
return new_quant_params
    def with_bias_correction(self, enable: bool | BiasCorrectionType = True) -> "QuantizationParams":
"""
Enables or disables bias correction for the quantization of convolutions with a bias.
Bias correction calculates a bias term based on the observed input mean and the quantized weights. This term is then added to the convolution output to compensate for quantization errors.
The algorithm is described in detail in Section 4.2 of the referenced paper.
Parameters:
enable (bool | BiasCorrectionType): Required. Determines whether bias correction is enabled or disabled.
- ``True``: Enables regular bias correction.
- ``False``: Disables bias correction.
- ``BiasCorrectionType``: Allows specifying a custom bias correction type.
Returns:
QuantizationParams: A new instance of quantization parameters with bias correction enabled or disabled as specified.
"""
match enable:
case True:
corr_type = BiasCorrectionType.REGULAR
case False:
corr_type = BiasCorrectionType.NONE
case _:
corr_type = enable
new_quant_params = dataclasses.replace(self, biascorr_type=corr_type)
return new_quant_params
    def with_channel_equalization(self, enable: bool = True) -> "QuantizationParams":
"""
Enables or disables channel equalization for the quantization parameters.
Channel equalization is a preprocessing step that aims to balance the distribution of weight tensors across different channels, which can enhance the accuracy of quantized models.
Parameters:
enable (bool, optional): Specifies whether to enable channel equalization. Defaults to ``True``.
Returns:
QuantizationParams: A new instance of quantization parameters with the updated channel equalization setting.
"""
new_quant_params = dataclasses.replace(self, channel_equalization=enable)
return new_quant_params
    def with_smooth_quant(self, enable: bool = True) -> "QuantizationParams":
        """
        Enables or disables SmoothQuant for the quantization parameters.

        Parameters:
            enable (bool, optional): Specifies whether to enable SmoothQuant. Defaults to ``True``.

        Returns:
            QuantizationParams: A new instance of quantization parameters with the updated SmoothQuant setting.
        """
        new_quant_params = dataclasses.replace(self, smooth_quant=enable)
        return new_quant_params
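# Illustrative usage sketch (not part of the API): QuantizationParams is an
# immutable dataclass, and each with_* method returns a new instance via
# dataclasses.replace, so the calls can be chained.
def _example_build_params() -> QuantizationParams:
    base = QuantizationParams(
        calibration_method=default_calibration(),
        activation_quantization_scheme=quantization_scheme(True, False),
        weight_quantization_scheme=quantization_scheme(False, True))
    return (base
            .with_calibration(CalibrationMethod.from_str('entropy'))
            .with_bias_correction(True)
            .with_channel_equalization())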
default_quantization: QuantizationParams = QuantizationParams(
    calibration_method=default_calibration(),
    activation_quantization_scheme=quantization_scheme(True, False),
    weight_quantization_scheme=quantization_scheme(False, True),
    requantization_mode=RequantizationMode.sima,
    node_names={''},
    custom_quantization_configs=None)
"""
Default quantization parameters for model quantization.

This configuration can be used as a baseline for quantizing a neural network using ``quantize_net`` or as a starting point for customizing quantization parameters.
It specifies default settings for calibration methods, activation and weight quantization schemes, and the requantization mode.

Attributes:
    calibration_method (CalibrationMethod): The calibration method used for quantization. Defaults to ``HistogramMSEMethod(2048)``.
    activation_quantization_scheme (QuantizationScheme): Defines the quantization scheme for activations.

        - Asymmetric: ``True``
        - Per-channel: ``False``
        - Bits: ``8``
    weight_quantization_scheme (QuantizationScheme): Defines the quantization scheme for weights.

        - Asymmetric: ``False``
        - Per-channel: ``True``
        - Bits: ``8``
    requantization_mode (RequantizationMode): The mode used for requantization. Defaults to ``RequantizationMode.sima``.
    node_names (set): Names of nodes to exclude from quantization. Defaults to ``{''}``, a set containing only the empty string, which excludes no real nodes.
    custom_quantization_configs (Optional[Dict]): Custom configurations for specific nodes. Defaults to ``None``.
"""
int16_quantization: QuantizationParams = QuantizationParams(
    calibration_method=default_calibration(),
    # int16 is for activations only
    activation_quantization_scheme=quantization_scheme(True, False, 16),
    # Weights are always int8
    weight_quantization_scheme=quantization_scheme(False, True),
    requantization_mode=RequantizationMode.sima,
    node_names={''},
    custom_quantization_configs=None)
"""
Int16 quantization parameters for model quantization.

This configuration is designed for quantizing neural networks where activations use 16-bit precision and weights use 8-bit precision.
It can be used with ``quantize_net`` as a baseline for models requiring higher precision for activations, while maintaining efficient
weight quantization.

Attributes:
    calibration_method (CalibrationMethod): The calibration method used for quantization. Defaults to ``HistogramMSEMethod(2048)``.
    activation_quantization_scheme (QuantizationScheme): Defines the quantization scheme for activations.

        - Asymmetric: ``True``
        - Per-channel: ``False``
        - Bits: ``16``
    weight_quantization_scheme (QuantizationScheme): Defines the quantization scheme for weights.

        - Asymmetric: ``False``
        - Per-channel: ``True``
        - Bits: ``8``
    requantization_mode (RequantizationMode): The mode used for requantization. Defaults to ``RequantizationMode.sima``.
    node_names (set): Names of nodes to exclude from quantization. Defaults to ``{''}``, a set containing only the empty string, which excludes no real nodes.
    custom_quantization_configs (Optional[Dict]): Custom configurations for specific nodes. Defaults to ``None``.
"""