SIMA_DETESS_DEQUANT

Description

The detess_dequant graph accepts an array of tensors in NCHW and their corresponding detessellate tile dimension and dequantization parameters. This input tensor array is expected to be placed sequentially one after the other in the memory. The graph detessellates the input tensors and dequantizes them to FP16 or FP32 depending on the output data type configured by the user.

Detessellation tile/slcie size is calculated as below:

tile_sz = tile_width x tile_height x (tile_channel rounded to nearest 16 byte boundary) x data_type_size_bytes

The various supported input output combinations are as per the below table:

+--------------+---------------+------------+-----------+-----------+
| in_data_type | out_data_type | out_format | tile_size | supported |
+--------------+---------------+------------+-----------+-----------+
| INT8         | FP16          | NHWC       | small     |   Yes     |
|              | FP16          | NHWC       | large     |   No      |
|              | FP16          | NCHW       | small     |   No      |
|              | FP16          | NCHW       | large     |   No      |
|              | FP32          | NHWC       | small     |   Yes     |
|              | FP32          | NHWC       | large     |   Yes     |
|              | FP32          | NCHW       | small     |   Yes     |
|              | FP32          | NCHW       | large     |   Yes     |
+--------------+---------------+------------+-----------+-----------+
| INT16        | FP16          | NHWC       | small     |   No      |
|              | FP16          | NHWC       | large     |   No      |
|              | FP16          | NCHW       | small     |   No      |
|              | FP16          | NCHW       | large     |   No      |
|              | FP32          | NHWC       | small     |   Yes     |
|              | FP32          | NHWC       | large     |   No      |
|              | FP32          | NCHW       | small     |   No      |
|              | FP32          | NCHW       | large     |   No      |
+--------------+---------------+------------+-----------+-----------+
| INT32        | FP16          | NHWC       | small     |   No      |
|              | FP16          | NHWC       | large     |   No      |
|              | FP16          | NCHW       | small     |   No      |
|              | FP16          | NCHW       | large     |   No      |
|              | FP32          | NHWC       | small     |   Yes     |
|              | FP32          | NHWC       | large     |   No      |
|              | FP32          | NCHW       | small     |   No      |
|              | FP32          | NCHW       | large     |   No      |
+--------------+---------------+------------+-----------+-----------+

Graph Info

Overview

SIMA_DETESS_DEQUANT
Graph Name	SIMA_DETESS_DEQUANT
Graph ID	201
Operations Supported	Dequantize Detesselate
Available Since Yocto Build	B684

Example Config

Below is the example config json for this graph. We need to use such config for configuring the EV74 graph first. For this purpose, we need a CVU Configuration Application developed in C++.

{
    "version": 0.1,
    "node_name": "detess-dequant",
    "simaai__params": {
        "params": 15,
        "cpu": 1,
        "next_cpu": 0,
        "no_of_outbuf": 1,
        "ibufname": "",
        "graph_id": 201,
        "num_tensors": 1,
        "input_width": [
            1
        ],
        "input_height": [
            1
        ],
        "input_depth": [
            1000
        ],
        "slice_width": [
            1
        ],
        "slice_height": [
            1
        ],
        "slice_depth": [
            1000
        ],
        "dq_scale": [
            255.02200010497842
        ],
        "dq_zp": [
            -128
        ],
        "data_type": [
            0
        ],
        "fp16_out_en": [
            0
        ],
        "output_format": [
            0
        ],
        "debug": 0,
        "out_sz": 4000,
        "dump_data": 1
    }
}

Parameters

SIMA_DETESS_DEQUANT Params
Parameter Name	Parameter Description	Data Type	Default	Min	Max
num_tensors	Number of input tensors in the input buffer	int32_t	13	1	32
input_width	Width of the input tensor	int32_t	[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64]	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]	[4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
input_height	Height of the input tensor	int32_t	[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64]	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]	[4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
input_depth	Depth of the input tensor	int32_t	[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32]	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]	[4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
slice_width	Slice/Tile width to be used for detessellation	int32_t	[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]	[4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
slice_height	Slice/Tile height to be used for detessellation	int32_t	[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]	[4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
slice_depth	Slice/Tile depth/channels to be used for detessellation	int32_t	[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32]	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]	[4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
dq_scale	Dequantization scale	float32	[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]	[1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0]
dq_zp	Dequantization zero point	int32_t	[‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’, ‘INT8’]	[-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128]	[127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127]
data_type	Specifies the input tessellated datatype, 0=> ‘INT8’, 1=> ‘INT16’, 2=> ‘INT32’.	int32_t	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]	N/A	N/A
fp16_out_en	Enables fp16 output if set to 1 else, gives fp32 output.	bool	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]	0	1
output_format	Output tensor format. 0 => NHWC, 1 => NCHW..	int32_t	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]	0	1
debug	Enable more debug logs, 0 => disable, 1=> additonal logs, 2 => profile runtime of individual input tensors, 3 => profile overall graph runtime.	int32_t	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]	0	3
dump_data	Enable (1) or disable (0) dumping of output tensor to `/tmp` directory on device with the name `{node_name}-###.out`. The sequence number `###` will increment with each output dump (e.g., -001.out, -002.out, …).	int32_t	0	0	1

CVU Configuration Application

Note

The need to write, build and execute a dependent application for the CVU will be removed in an upcoming release.

To configure any CVU graph, a C++ CVU Configuration Application must be cross-compiled and executed on the board before using the CVU. Multiple graphs can be pre-programmed into the CVU before running any application. This guide provides a pre-written C++ application for download for each graph that can be cross-compiled on Palette, and executed on the board prior to running the simaaiprocesscvu GStreamer plugin. An pre-compiled version is also included for direct use. If you encounter issues, please re-compile the application from the sources provided.

How to compile using the files below

Please refer to How to compile CVU Configuration Application? for more info.

Directory structure

.
├── CMakeLists.txt
├── cvu_cfg_graph.cpp
└── cvu_cfg_main.cpp

Code files

cvu_cfg_graph.cpp

 #include <simaai/ev_cfg_helper.h>
 #include <simaai/parser.h>
 #include <simaai/platform/simaevxxipc.h>
 #include <string.h>

 #define SIMA_IPC_GRAPH_NAME "SIMA_DETESS_DEQUANT"
 #define SIMA_IPC_GRAPH_CODE (201)

 #define NUM_IN_TENSORS (1)
 #define INPUT_WIDTH (2)
 #define INPUT_HEIGHT (3)
 #define INPUT_DEPTH (4)
 #define SLICE_WIDTH (5)
 #define SLICE_HEIGHT (6)
 #define SLICE_DEPTH (7)
 #define DEQUANT_SCALE (8)
 #define DEQUANT_ZEROPOINT (9)
 #define INPUT_TYPE (10)
 #define FP16_OUT_ENABLED (11)
 #define OUTPUT_FORMAT (12)
 #define DEBUG (13)

 int parse_and_send_array_param(simaai_params_t *params, const char *param_name, int array_len, int graph_id, int param_id, bool is_float) {
   simaai_double_array_t *arr;
   uint8_t *buf = (uint8_t *)calloc(1, sizeof(uint32_t) * array_len);
   arr = (simaai_double_array_t *)parser_get_double_array(params, param_name);
   if (arr != NULL) {
     if (arr->size != array_len) {
       std::cout << "\n Param list incomplete for " << /*static_cast <const void *>*/ (param_name);
       return -1;
     }

     for (int i = 0; i < arr->size; i++) {
       if(is_float) {
         send_float_param(2, graph_id, param_id, buf, (double)arr->values[i]);
       } else {
         send_i32_param(2, graph_id, param_id, buf, (int)arr->values[i]);
       }
     }
   }
   else {
     std::cout << "\n Param list empty for " << /*static_cast <const void *>*/ (param_name);
     return -1;
   }
   return 0;
 }

 void configure_graph(const char *json_in) {
   simaai_params_t *params = parser_node_struct_init();
   if (params == NULL) {
     std::cout << "Unable to create params \n";
   }
   if ((parse_json_file(json_in, params) != PARSER_SUCCESS)) {
     std::cout << "Unable to start parser \n";
   }

   uint8_t *buf = (uint8_t *)calloc(1, sizeof(uint8_t) * 16);

   int num_input_tensors_val = *((int *)parser_get_int(params, "num_tensors"));
   send_i32_param(2, SIMA_IPC_GRAPH_CODE, NUM_IN_TENSORS, buf, num_input_tensors_val);

   parse_and_send_array_param(params, "input_width", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, INPUT_WIDTH, false);
   parse_and_send_array_param(params, "input_height", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, INPUT_HEIGHT, false);
   parse_and_send_array_param(params, "input_depth", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, INPUT_DEPTH, false);
   parse_and_send_array_param(params, "slice_width", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, SLICE_WIDTH, false);
   parse_and_send_array_param(params, "slice_height", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, SLICE_HEIGHT, false);
   parse_and_send_array_param(params, "slice_depth", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, SLICE_DEPTH, false);
   parse_and_send_array_param(params, "dq_scale", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, DEQUANT_SCALE, true);
   parse_and_send_array_param(params, "dq_zp", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, DEQUANT_ZEROPOINT, false);
   parse_and_send_array_param(params, "data_type", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, INPUT_TYPE, false);
   parse_and_send_array_param(params, "fp16_out_en", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, FP16_OUT_ENABLED, false);
   parse_and_send_array_param(params, "output_format", num_input_tensors_val, SIMA_IPC_GRAPH_CODE, OUTPUT_FORMAT, false);

   int debug_val = *((int *)parser_get_int(params, "debug"));
   send_i32_param(2, SIMA_IPC_GRAPH_CODE, DEBUG, buf, debug_val);

   parser_finalize(params);
   free(buf);

   std::cout << "Completed " << SIMA_IPC_GRAPH_NAME << " graph configure \n";
 }

cvu_cfg_main.cpp

 #include <getopt.h>
 #include <sys/stat.h>
 #include <unistd.h>

 #include <cstring>
 #include <iostream>

 extern void configure_graph(const char *json_fpath);

 bool is_valid_path(const char *path) {
   struct stat buffer;
   return (stat(path, &buffer) == 0);
 }

 int main(int argc, char **argv) {
   const char *json_path = argv[1];

   if(is_valid_path(json_path)) {
     configure_graph(json_path);
   } else {
     std::cerr << "Invalid path: " << json_path << std::endl;
     return 1;
   }

   return 0;
 }

CMakeLists.txt

 cmake_minimum_required(VERSION 3.16)

 # set the project name
 set(GRAPH_NAME "detessdequant_201")
 set(PROJECT_NAME "CVU Graph Cfg. App.")

 project("${PROJECT_NAME}"
     VERSION 0.1
     DESCRIPTION "CVU Graph Configuration Application"
     LANGUAGES C CXX)

 set(PIPELINE_SOURCES
     cvu_cfg_graph.cpp)

 execute_process(
     COMMAND git rev-parse --abbrev-ref HEAD
     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
     OUTPUT_VARIABLE GIT_BRANCH
     OUTPUT_STRIP_TRAILING_WHITESPACE
 )

 # Get the latest abbreviated commit hash of the working branch
 execute_process(
     COMMAND git log -1 --format=%h
     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
     OUTPUT_VARIABLE GIT_COMMIT_HASH
     OUTPUT_STRIP_TRAILING_WHITESPACE
 )

 link_directories(${CMAKE_INSTALL_DIR}/core
     ${CMAKE_INSTALL_DIR}/gst
 )

 include(GNUInstallDirs)

 # ev-configuration genertion executable
 set(EV_EXEC_NAME "${GRAPH_NAME}_cvu_cfg_app")

 add_executable(${EV_EXEC_NAME}
     cvu_cfg_main.cpp
     cvu_cfg_graph.cpp)

 target_link_libraries(${EV_EXEC_NAME}
     PUBLIC
     simaaiparser
     evhelpers)

 INSTALL(TARGETS "${EV_EXEC_NAME}")