Source code for helper

"""
Copyright (c) 2025 Sima Technologies, Inc.

SPDX-License-Identifier: Apache-2.0
"""
from typing import List
import cv2
import json
import math
import numpy as np
from pathlib import Path

class YoloHelpers:
    """Yolo Helper class."""

    @staticmethod
    def nms(boxes, scores, iou_threshold):
        # Sort box indices by descending score
        sorted_indices = np.argsort(scores)[::-1]

        keep_boxes = []
        while sorted_indices.size > 0:
            # Pick the box with the highest remaining score
            box_id = sorted_indices[0]
            keep_boxes.append(box_id)

            # Compute IoU of the picked box with the rest
            ious = YoloHelpers.compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])

            # Remove boxes with IoU over the threshold
            keep_indices = np.where(ious < iou_threshold)[0]
            sorted_indices = sorted_indices[keep_indices + 1]

        return keep_boxes

    @staticmethod
    def bridge(image: np.ndarray, mask: np.ndarray, **kwds) -> tuple:
        # Binarize the mask and find the bounding box of the foreground
        mask[mask > 0] = 1
        coords = np.column_stack(np.where(mask > 0))
        y_min, x_min = coords.min(axis=0)
        y_max, x_max = coords.max(axis=0)

        # Crop the image to the mask extent and convert it to 3-channel grayscale
        cropped_image = image[y_min:y_max, x_min:x_max]
        cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_RGB2GRAY)
        cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_GRAY2RGB)

        # Zero out everything outside the mask
        cropped_mask = mask[y_min:y_max, x_min:x_max]
        seg_seat = (cropped_image * cropped_mask[:, :, None]).astype(np.uint8)

        # Mirror the crop when its content is left-heavy
        if YoloHelpers.flip_image(seg_seat):
            seg_seat = np.fliplr(seg_seat)
        return seg_seat, cropped_mask, (x_min, y_min), (x_max, y_max)

    @staticmethod
    def flip_image(image: np.ndarray) -> bool:
        # True when the left half of the image has more non-zero pixels than the right half
        _, mid, _ = image.shape
        mid = mid // 2
        return np.count_nonzero(image[:, :mid, :]) > np.count_nonzero(image[:, mid:, :])

    @staticmethod
    def compute_iou(box, boxes):
        # Compute xmin, ymin, xmax, ymax of the intersection rectangles
        xmin = np.maximum(box[0], boxes[:, 0])
        ymin = np.maximum(box[1], boxes[:, 1])
        xmax = np.minimum(box[2], boxes[:, 2])
        ymax = np.minimum(box[3], boxes[:, 3])

        # Compute intersection area
        intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)

        # Compute union area
        box_area = (box[2] - box[0]) * (box[3] - box[1])
        boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
        union_area = box_area + boxes_area - intersection_area

        # Compute IoU
        iou = intersection_area / union_area
        return iou

    @staticmethod
    def xywh2xyxy(x):
        # Convert bounding box (x, y, w, h) to bounding box (x1, y1, x2, y2)
        y = np.copy(x)
        y[..., 0] = x[..., 0] - x[..., 2] / 2
        y[..., 1] = x[..., 1] - x[..., 3] / 2
        y[..., 2] = x[..., 0] + x[..., 2] / 2
        y[..., 3] = x[..., 1] + x[..., 3] / 2
        return y

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def rescale_boxes(boxes, input_shape, image_shape):
        # Rescale boxes from input dimensions to original image dimensions
        input_shape = np.array([input_shape[1], input_shape[0], input_shape[1], input_shape[0]])
        boxes = np.divide(boxes, input_shape, dtype=np.float32)
        boxes *= np.array([image_shape[1], image_shape[0], image_shape[1], image_shape[0]])
        return boxes

    @staticmethod
    def extract_boxes(box_predictions, img_height, img_width):
        # Extract boxes from predictions
        boxes = box_predictions[:, :4]

        # Scale boxes to original image dimensions
        boxes = YoloHelpers.rescale_boxes(boxes, (640, 640), (img_height, img_width))

        # Convert boxes to xyxy format
        boxes = YoloHelpers.xywh2xyxy(boxes)

        # Clip the boxes to the image bounds
        boxes[:, 0] = np.clip(boxes[:, 0], 0, img_width)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, img_height)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, img_width)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, img_height)
        return boxes

    @staticmethod
    def process_box_output(box_output, img_height=3024, img_width=4032, conf_thres=0.5, iou_thres=0.5, num_masks=32):
        predictions = np.squeeze(box_output).T
        num_classes = box_output.shape[1] - num_masks - 4

        # Filter out object confidence scores below threshold
        scores = np.max(predictions[:, 4 : 4 + num_classes], axis=1)
        predictions = predictions[scores > conf_thres, :]
        scores = scores[scores > conf_thres]

        if len(scores) == 0:
            return [], [], [], np.array([])

        box_predictions = predictions[..., : num_classes + 4]
        mask_predictions = predictions[..., num_classes + 4 :]

        # Get the class with the highest confidence
        class_ids = np.argmax(box_predictions[:, 4:], axis=1)

        # Get bounding boxes for each object
        boxes = YoloHelpers.extract_boxes(box_predictions, img_height, img_width)

        # Apply non-maximum suppression to suppress weak, overlapping bounding boxes
        indices = YoloHelpers.nms(boxes, scores, iou_thres)

        return boxes[indices], scores[indices], class_ids[indices], mask_predictions[indices]

    @staticmethod
    def process_mask_output(mask_predictions, mask_output, boxes, img_height, img_width):
        if mask_predictions.shape[0] == 0:
            return []

        mask_output = np.squeeze(mask_output)

        # Calculate the mask maps for each box
        num_mask, mask_height, mask_width = mask_output.shape  # CHW
        masks = YoloHelpers.sigmoid(mask_predictions @ mask_output.reshape((num_mask, -1)))
        masks = masks.reshape((-1, mask_height, mask_width))

        # Downscale the boxes to match the mask size
        scale_boxes = YoloHelpers.rescale_boxes(boxes, (img_height, img_width), (mask_height, mask_width))

        # For every box/mask pair, get the mask map
        mask_maps = np.zeros((len(scale_boxes), img_height, img_width))
        blur_size = (int(img_width / mask_width), int(img_height / mask_height))
        for i in range(len(scale_boxes)):
            scale_x1 = int(math.floor(scale_boxes[i][0]))
            scale_y1 = int(math.floor(scale_boxes[i][1]))
            scale_x2 = int(math.ceil(scale_boxes[i][2]))
            scale_y2 = int(math.ceil(scale_boxes[i][3]))

            x1 = int(math.floor(boxes[i][0]))
            y1 = int(math.floor(boxes[i][1]))
            x2 = int(math.ceil(boxes[i][2]))
            y2 = int(math.ceil(boxes[i][3]))

            # Upscale the per-box crop of the prototype mask to image resolution,
            # smooth it, and binarize at 0.5
            scale_crop_mask = masks[i][scale_y1:scale_y2, scale_x1:scale_x2]
            crop_mask = cv2.resize(scale_crop_mask, (x2 - x1, y2 - y1), interpolation=cv2.INTER_CUBIC)
            crop_mask = cv2.blur(crop_mask, blur_size)
            crop_mask = (crop_mask > 0.5).astype(np.uint8)
            mask_maps[i, y1:y2, x1:x2] = crop_mask

        return mask_maps

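# Illustrative usage sketch for the box utilities (toy values only):
#
#     boxes = np.array([[0, 0, 10, 10], [1, 1, 11, 11], [20, 20, 30, 30]], dtype=np.float32)
#     scores = np.array([0.9, 0.8, 0.7])
#     keep = YoloHelpers.nms(boxes, scores, iou_threshold=0.5)
#     # keep == [0, 2]; box 1 overlaps box 0 with IoU ~= 0.68 and is suppressed
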
def postprocess(outputs: List[np.ndarray], input_height, input_width) -> tuple:
    boxes, scores, class_ids, mask_pred = YoloHelpers.process_box_output(outputs[0], input_height, input_width)
    mask_maps = YoloHelpers.process_mask_output(mask_pred, outputs[1], boxes, input_height, input_width)
    return boxes, scores, class_ids, mask_maps

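# Illustrative end-to-end sketch. `run_model` is hypothetical, standing in for
# whatever produces the two network outputs (detection head, mask prototypes):
#
#     outputs = run_model(preprocessed_frame)
#     boxes, scores, class_ids, mask_maps = postprocess(outputs, input_height=3024, input_width=4032)
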
def load_threshold(path: Path):
    """Helper function to load the threshold."""
    with path.open('r') as f:
        threshold = json.load(f)
    cls_thres = threshold['cls_thresh']
    seg_thres = threshold['seg_thresh']
    max_score = threshold['max_score']
    min_score = threshold['min_score']
    return cls_thres, seg_thres, max_score, min_score

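# The threshold JSON is expected to carry these four keys (values below are
# placeholders, not calibrated thresholds):
#
#     {"cls_thresh": 0.5, "seg_thresh": 0.5, "max_score": 1.0, "min_score": 0.0}
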
def load_quantile(path: Path):
    """Helper function to load the quantiles."""
    with path.open('r') as f:
        quantiles = json.load(f)
    return (
        np.array(quantiles['q_st_start']),
        np.array(quantiles['q_st_end']),
        np.array(quantiles['q_ae_start']),
        np.array(quantiles['q_ae_end']),
    )

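# The quantile JSON is expected to carry four arrays (shapes depend on the model):
#
#     {"q_st_start": [...], "q_st_end": [...], "q_ae_start": [...], "q_ae_end": [...]}
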
def load_mean_std(path: Path):
    """Helper function to load the mean and standard deviation."""
    with path.open('r') as f:
        mean_std_dict = json.load(f)
    mean = np.array(mean_std_dict['mean'])
    std = np.array(mean_std_dict['std'])
    return mean, std

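# The mean/std JSON is expected to look like the following (per-channel
# placeholder values shown):
#
#     {"mean": [0.485, 0.456, 0.406], "std": [0.229, 0.224, 0.225]}
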
def merge_levels(preds: List[np.ndarray]) -> np.ndarray:
    """Flatten each level's spatial dimensions and concatenate along the last axis."""
    b, d = preds[0].shape[:2]
    return np.concatenate([pred.reshape(b, d, -1) for pred in preds], -1)

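# Shape sketch with toy inputs: each (B, D, H, W) level is flattened to
# (B, D, H*W) and the levels are concatenated along the last axis:
#
#     a = np.zeros((1, 8, 4, 4))       # -> (1, 8, 16)
#     b = np.zeros((1, 8, 2, 2))       # -> (1, 8, 4)
#     merge_levels([a, b]).shape       # (1, 8, 20)
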
def effad_postproc(yolo_output: dict, effad_output: np.ndarray, cls_threshold: float) -> dict:
    eps = 1e-7
    crop_height, crop_width, _ = yolo_output["seg_seat"].shape

    # Resize the anomaly map to the crop size and min-max normalize it
    x = effad_output
    x = x[0, 0]
    x = cv2.resize(x, (crop_width, crop_height))
    x_norm = (x - np.min(x)) / (np.max(x) - np.min(x) + eps)
    pred = x_norm

    # Classification decision on the raw (un-normalized) map
    res = np.max(x) >= cls_threshold

    # Paste the masked prediction back into a full-size map
    cropped_mask = yolo_output["cropped_mask"]
    mask = yolo_output["mask"]
    x_min, y_min = yolo_output["x_y_min"]
    x_max, y_max = yolo_output["x_y_max"]
    pred[cropped_mask == 0] = 0
    output = np.zeros_like(mask)
    output[y_min:y_max, x_min:x_max] = pred
    return {
        "pred": output,
        "res": res,
    }

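# Illustrative wiring of the two stages. The dict keys mirror what
# effad_postproc reads; `anomaly_map` is a hypothetical (1, 1, H, W) array
# produced by the EfficientAD-style model:
#
#     seg_seat, cropped_mask, x_y_min, x_y_max = YoloHelpers.bridge(image, mask)
#     yolo_output = {
#         "seg_seat": seg_seat,
#         "cropped_mask": cropped_mask,
#         "mask": mask,
#         "x_y_min": x_y_min,
#         "x_y_max": x_y_max,
#     }
#     result = effad_postproc(yolo_output, anomaly_map, cls_threshold=0.5)
#     result["pred"]  # full-size anomaly map, zeroed outside the segmented crop
#     result["res"]   # True when the peak anomaly score crosses cls_threshold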