# Source code for pytb.detection.bboxes.bboxes_2d_detector.yolo4.yolo4

from pytb.detection.bboxes.bboxes_2d_detector.bboxes_2d_detector import BBoxes2DDetector
from pytb.output.bboxes_2d import BBoxes2D

from timeit import default_timer

import cv2
import numpy as np
import logging

log = logging.getLogger("aptitude-toolbox")


class YOLO4(BBoxes2DDetector):
    """2D bounding-box detector for Darknet YOLO v2, v3 and v4 models run through OpenCV's DNN module.

    Two OpenCV front-ends are supported and selected through ``self.pref_implem``:
    ``"cv2-DetectionModel"`` and ``"cv2-ReadNet"``.
    """

    def __init__(self, proc_parameters: dict):
        """This class can be used for YOLO v2, v3, v4 models from Darknet.

        Initializes the detector with the given parameters.

        Args:
            proc_parameters (dict): A dictionary containing the YOLO detector parameters.
                The optional keys ``conf_thresh``, ``nms_thresh``, ``nms_across_classes``,
                ``GPU`` and ``half_precision`` are read from ``proc_parameters["params"]``.

        Raises:
            AssertionError: If ``self.pref_implem`` is neither ``"cv2-DetectionModel"``
                nor ``"cv2-ReadNet"``.
        """
        # NOTE(review): pref_implem, model_path, config_path, input_width and input_height
        # are not assigned in this class; presumably the parent initializer sets them.
        super().__init__(proc_parameters)

        params = proc_parameters["params"]

        # The minimum confidence threshold of the detected objects if the implementation allows to provide one.
        self.conf_thresh = params.get("conf_thresh", 0)

        # The minimum non-max suppression threshold of the detected objects if the implementation allows to provide one.
        # The non-max suppression can be implemented in multiple ways, results can vary.
        # Only the cv2-DetectionModel code path of this class makes use of it.
        self.nms_thresh = params.get("nms_thresh", 0)

        # Whether to perform the NMS algorithm across the different classes of object or separately.
        self.nms_across_classes = params.get("nms_across_classes", True)

        # Whether to use the GPU if available.
        self.gpu = params.get("GPU", False)

        # Whether to use the half precision capability of the recent GPU cards.
        self.half_precision = params.get("half_precision", False)

        log.debug("GPU set to {} and half precision set to {}.".format(self.gpu, self.half_precision))
        log.debug("YOLOv2-3-4 {} implementation selected.".format(self.pref_implem))

        # Implementation for YOLOv2-3-4 from OpenCV.
        # This implementation is slightly faster than cv2-Readnet but is a bit more 'blackbox'.
        if self.pref_implem == "cv2-DetectionModel":
            self.net = cv2.dnn_DetectionModel(self.model_path, self.config_path)
            self.net.setInputSize(self.input_width, self.input_height)
            self.net.setInputScale(1.0 / 255)
            self.net.setInputSwapRB(True)
            self.net.setNmsAcrossClasses(self.nms_across_classes)
            self._setup_cv2()

        # Implementation for YOLOv2-3-4 from OpenCV.
        # This implementation is slightly slower than cv2-DetectionModel
        # but outputs a bit more details about the predictions.
        elif self.pref_implem == "cv2-ReadNet":
            self.net = cv2.dnn.readNet(self.model_path, self.config_path)
            self._setup_cv2()

        else:
            # Raised explicitly (rather than `assert False`) so the check is not stripped
            # when Python runs with -O; the exception type and message are unchanged.
            raise AssertionError("[ERROR] Unknown implementation of YOLO: {}".format(self.pref_implem))

    def detect(self, frame: np.ndarray) -> BBoxes2D:
        """Performs a YOLO inference on the given frame.

        Args:
            frame (np.ndarray): The frame to infer YOLO detections.

        Returns:
            BBoxes2D: A set of 2D bounding boxes identifying the detected objects.

        Raises:
            AssertionError: If ``self.pref_implem`` is not a supported implementation.
        """
        if self.pref_implem == "cv2-DetectionModel":
            # Resize beforehand only when the frame does not already have the network input size.
            if frame.shape[:2] != (self.input_height, self.input_width):
                frame = cv2.resize(frame, (self.input_width, self.input_height), interpolation=cv2.INTER_AREA)
            return self._detect_cv2_detection_model(frame)

        if self.pref_implem == "cv2-ReadNet":
            # Scaling to [0, 1], resizing and the R/B channel swap are all done while building the blob.
            blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (self.input_width, self.input_height),
                                         swapRB=True, crop=False)
            return self._detect_cv2_read_net(blob)

        # Explicit raise: an `assert False` would vanish under -O and the method would then
        # fail later with an unrelated UnboundLocalError.
        raise AssertionError("[ERROR] Unknown implementation of YOLO: {}".format(self.pref_implem))

    def _setup_cv2(self):
        """
        Setup OpenCV framework with the required backend.

        Uses the CUDA backend when ``self.gpu`` is set (FP16 target if ``self.half_precision``
        is also set), otherwise the default OpenCV backend with the CPU target.
        """
        if self.gpu:
            self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
            # Half precision is for recent GPU cards that had such capability.
            if self.half_precision:
                self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)
                log.debug("OpenCV with DNN_BACKEND_CUDA target CUDAFP16.")
            else:
                self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
                log.debug("OpenCV with DNN_BACKEND_CUDA target CUDA.")
        else:
            self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
            self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
            log.debug("OpenCV with DNN_BACKEND_OPENCV and target CPU.")

    def _detect_cv2_detection_model(self, cv2_org_frame: np.ndarray) -> BBoxes2D:
        """Performs a YOLOv2-3-4 inference on the given frame using cv2-DetectionModel of openCV.

        Args:
            cv2_org_frame (np.ndarray): The frame to infer YOLOv2-3-4 detections.

        Returns:
            BBoxes2D: A set of 2D bounding boxes identifying the detected objects.
        """
        # Only the call to the network is timed; result formatting is excluded.
        start = default_timer()
        classes, confidences, boxes = self.net.detect(cv2_org_frame, confThreshold=self.conf_thresh,
                                                      nmsThreshold=self.nms_thresh)
        end = default_timer()

        # Format results (flatten() is only called when something was detected).
        if len(classes) > 0:
            classes = classes.flatten()
            confidences = confidences.flatten()

        output = BBoxes2D((end - start), np.array(boxes), np.array(classes), np.array(confidences),
                          self.input_width, self.input_height)
        return output

    def _detect_cv2_read_net(self, blob_org_frame: np.ndarray) -> BBoxes2D:
        """Performs a YOLOv2-3-4 inference on the given blob using cv2-ReadNet of openCV.

        Args:
            blob_org_frame (np.ndarray): The input blob of the frame to infer YOLOv2-3-4
                detections on (``detect`` builds it with ``cv2.dnn.blobFromImage``).

        Returns:
            BBoxes2D: A set of 2D bounding boxes identifying the detected objects.
        """
        # Detect objects
        self.net.setInput(blob_org_frame)
        layer_names = self.net.getLayerNames()
        # The unconnected output layers are given as 1-based ids. Some OpenCV releases are
        # reported to return them as an N x 1 array and others as a flat array; flattening
        # makes the lookup work with both layouts.
        out_layer_ids = np.asarray(self.net.getUnconnectedOutLayers()).flatten()
        output_layers = [layer_names[i - 1] for i in out_layer_ids]

        # Inference (only the forward pass is timed)
        start = default_timer()
        outputs = self.net.forward(output_layers)
        end = default_timer()

        classes = []
        confidences = []
        boxes = []

        # Loop-invariant factors applied to the first four values of each detection.
        scale = np.array([self.input_width, self.input_height, self.input_width, self.input_height])

        # Get the output of each yolo layers
        # Assumes each output is a 2D array with one detection per row, the first four values
        # describing the box and the values from index 5 on being per-class scores.
        for output in outputs:
            # Keep only the rows having at least one class score strictly above the threshold,
            # so the Python-level loop below only visits actual detections.
            keep = (output[:, 5:] > self.conf_thresh).any(axis=1)
            for detection in output[keep]:
                scores = detection[5:]
                box = detection[:4] * scale
                box -= np.array([box[2] / 2, box[3] / 2, 0, 0])  # to xt, yt, w, h
                classes.append(scores.argmax())
                confidences.append(np.max(scores[scores > self.conf_thresh]))
                boxes.append(box)

        return BBoxes2D((end - start), np.array(boxes), np.array(classes), np.array(confidences),
                        self.input_width, self.input_height)