From dcf1eada05729d9ebcd62a84b4b552e5fbe2c537 Mon Sep 17 00:00:00 2001 From: Naushir Patuck Date: Fri, 6 Sep 2024 12:44:34 +0100 Subject: [PATCH] postprocessing: Add initial support for IMX500 postprocessing Add postprocessing stages using the IMX500 inference output. MobileNet SSD object inference and PoseNet pose estimation software stages have been implemented in this commit. Signed-off-by: Naushir Patuck --- assets/imx500_mobilenet_ssd.json | 123 +++ assets/imx500_posenet.json | 29 + meson.build | 1 + meson_options.txt | 10 + .../imx500/imx500_object_detection.cpp | 348 +++++++ .../imx500/imx500_posenet.cpp | 867 ++++++++++++++++++ .../imx500/imx500_post_processing_stage.cpp | 280 ++++++ .../imx500/imx500_post_processing_stage.hpp | 78 ++ post_processing_stages/imx500/meson.build | 31 + post_processing_stages/meson.build | 5 + .../post_processing_stage.hpp | 21 + utils/download-imx500-models.sh | 42 + 12 files changed, 1835 insertions(+) create mode 100644 assets/imx500_mobilenet_ssd.json create mode 100644 assets/imx500_posenet.json create mode 100644 post_processing_stages/imx500/imx500_object_detection.cpp create mode 100644 post_processing_stages/imx500/imx500_posenet.cpp create mode 100644 post_processing_stages/imx500/imx500_post_processing_stage.cpp create mode 100644 post_processing_stages/imx500/imx500_post_processing_stage.hpp create mode 100644 post_processing_stages/imx500/meson.build create mode 100755 utils/download-imx500-models.sh diff --git a/assets/imx500_mobilenet_ssd.json b/assets/imx500_mobilenet_ssd.json new file mode 100644 index 00000000..0a45abf4 --- /dev/null +++ b/assets/imx500_mobilenet_ssd.json @@ -0,0 +1,123 @@ +{ + "imx500_object_detection": + { + "max_detections" : 5, + "threshold" : 0.6, + "network_file": "/usr/share/imx500-models/imx500_network_ssd_mobilenetv2_fpnlite_320x320_pp.rpk", + + "save_input_tensor": + { + "filename": "/home/pi/input_tensor.raw", + "num_tensors": 10, + "norm_val": [384, 384, 384, 0], + "norm_shift": [0, 0, 0, 0] + }, + + "temporal_filter": + { + "tolerance": 0.1, + "factor": 0.2, + "visible_frames": 4, + "hidden_frames": 2 + }, + + "classes": + [ + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "-", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "-", + "backpack", + "umbrella", + "-", + "-", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "-", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "-", + "dining table", + "-", + "-", + "toilet", + "-", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "-", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush" + ] + }, + + "object_detect_draw_cv": + { + "line_thickness" : 2 + } +} diff --git a/assets/imx500_posenet.json b/assets/imx500_posenet.json new file mode 100644 index 00000000..c008f2a5 --- /dev/null +++ b/assets/imx500_posenet.json @@ -0,0 +1,29 @@ +{ + "imx500_posenet": + { + "max_detections" : 5, + "threshold" : 0.4, + "offset_refinement_steps": 5, + "nms_radius": 10.0, + "network_file": "/usr/share/imx500-models/imx500_network_posenet.rpk", + + "save_input_tensor": + { + "filename": "/home/pi/posenet_input_tensor.raw", + "num_tensors": 10 + }, + + "temporal_filter": + { + "tolerance": 0.3, + "factor": 0.3, + "visible_frames": 8, + "hidden_frames": 2 + } + }, + + "plot_pose_cv": + { + "confidence_threshold" : 0.2 + } +} diff --git a/meson.build b/meson.build index 7d4c61c8..1a742df3 100644 --- a/meson.build +++ b/meson.build @@ -80,5 +80,6 @@ summary({ 'OpenCV postprocessing' : enable_opencv, 'TFLite postprocessing' : enable_tflite, 'Hailo postprocessing' : enable_hailo, + 'IMX500 postprocessing' : get_option('enable_imx500'), }, bool_yn : true, section : 'Build configuration') diff --git a/meson_options.txt b/meson_options.txt index 13b6de16..0f24d546 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -43,3 +43,13 @@ option('download_hailo_models', type : 'boolean', value : true, description : 'Download and install the Hailo postprocessing models') + +option('enable_imx500', + type : 'boolean', + value : true, + description : 'Enable IMX500 postprocessing support') + +option('download_imx500_models', + type : 'boolean', + value : true, + description : 'Download and install the IMX500 postprocessing models') diff --git a/post_processing_stages/imx500/imx500_object_detection.cpp b/post_processing_stages/imx500/imx500_object_detection.cpp new file mode 100644 index 00000000..d3f74b34 --- /dev/null +++ b/post_processing_stages/imx500/imx500_object_detection.cpp @@ -0,0 +1,348 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_object_detection.cpp - IMX500 object detection for various networks + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "core/rpicam_app.hpp" +#include "post_processing_stages/object_detect.hpp" +#include "post_processing_stages/post_processing_stage.hpp" + +#include "imx500_post_processing_stage.hpp" + +using Rectangle = libcamera::Rectangle; +using Size = libcamera::Size; +namespace controls = libcamera::controls; + +#define NAME "imx500_object_detection" + +struct Bbox +{ + float x0; + float y0; + float x1; + float y1; +}; + +struct ObjectDetectionOutput +{ + unsigned int num_detections = 0; + std::vector bboxes; + std::vector scores; + std::vector classes; +}; + +class ObjectDetection : public IMX500PostProcessingStage +{ +public: + ObjectDetection(RPiCamApp *app) : IMX500PostProcessingStage(app) {} + + char const *Name() const override; + + void Read(boost::property_tree::ptree const ¶ms) override; + + void Configure() override; + + bool Process(CompletedRequestPtr &completed_request) override; + +private: + int processOutputTensor(std::vector &objects, const std::vector &output_tensor, + const CnnOutputTensorInfo &output_tensor_info, const Rectangle &scaler_crop) const; + void filterOutputObjects(std::vector &objects); + + struct LtObject + { + Detection params; + unsigned int visible; + unsigned int hidden; + bool matched; + }; + + std::vector lt_objects_; + std::mutex lt_lock_; + + // Config params + unsigned int max_detections_; + float threshold_; + std::vector classes_; + bool temporal_filtering_; + float tolerance_; + float factor_; + unsigned int visible_frames_; + unsigned int hidden_frames_; + bool started_ = false; +}; + +char const *ObjectDetection::Name() const +{ + return NAME; +} + +void ObjectDetection::Read(boost::property_tree::ptree const ¶ms) +{ + max_detections_ = params.get("max_detections"); + threshold_ = params.get("threshold", 0.5f); + classes_ = PostProcessingStage::GetJsonArray(params, "classes"); + + if (params.find("temporal_filter") != params.not_found()) + { + temporal_filtering_ = true; + tolerance_ = params.get("temporal_filter.tolerance", 0.05); + factor_ = params.get("temporal_filter.factor", 0.2); + visible_frames_ = params.get("temporal_filter.visible_frames", 5); + hidden_frames_ = params.get("temporal_filter.hidden_frames", 2); + } + else + temporal_filtering_ = false; + + IMX500PostProcessingStage::Read(params); +} + +void ObjectDetection::Configure() +{ + lt_objects_.clear(); + IMX500PostProcessingStage::Configure(); + if (!started_) + { + IMX500PostProcessingStage::ShowFwProgressBar(); + started_ = true; + } +} + +bool ObjectDetection::Process(CompletedRequestPtr &completed_request) +{ + auto scaler_crop = completed_request->metadata.get(controls::ScalerCrop); + if (!raw_stream_ || !scaler_crop) + { + LOG_ERROR("Must have RAW stream and scaler crop available to get sensor dimensions!"); + return false; + } + + auto output = completed_request->metadata.get(controls::rpi::CnnOutputTensor); + auto info = completed_request->metadata.get(controls::rpi::CnnOutputTensorInfo); + std::vector objects; + + // Process() can be concurrently called through different threads for consecutive CompletedRequests if + // things are running behind. So protect access to the lt_objects_ state object. + std::scoped_lock l(lt_lock_); + + if (output && info) + { + std::vector output_tensor(output->data(), output->data() + output->size()); + CnnOutputTensorInfo output_tensor_info = *reinterpret_cast(info->data()); + + processOutputTensor(objects, output_tensor, output_tensor_info, *scaler_crop); + + if (temporal_filtering_) + { + filterOutputObjects(objects); + if (lt_objects_.size()) + { + objects.clear(); + for (auto const &obj : lt_objects_) + { + if (!obj.hidden) + objects.push_back(obj.params); + } + } + } + else + { + // If no temporal filtering, make sure we fill lt_objects_ with the current result as it may be used if + // there is no output tensor on subsequent frames. + lt_objects_.clear(); + for (auto const &obj : objects) + lt_objects_.push_back({ obj, 0, 0, 0 }); + } + } + else + { + // No output tensor, so simply reuse the results from the lt_objects_. + for (auto const &obj : lt_objects_) + { + if (!obj.hidden) + objects.push_back(obj.params); + } + } + + if (objects.size()) + completed_request->post_process_metadata.Set("object_detect.results", objects); + + return IMX500PostProcessingStage::Process(completed_request); +} + +static int createObjectDetectionData(ObjectDetectionOutput &output, const std::vector &data, + unsigned int total_detections) +{ + unsigned int count = 0; + + // Extract bounding box co-ordinates + for (unsigned int i = 0; i < total_detections; i++) + { + Bbox bbox; + bbox.y0 = data.at(count + i); + bbox.x0 = data.at(count + i + (1 * total_detections)); + bbox.y1 = data.at(count + i + (2 * total_detections)); + bbox.x1 = data.at(count + i + (3 * total_detections)); + output.bboxes.push_back(bbox); + } + count += (total_detections * 4); + + // Extract scores + for (unsigned int i = 0; i < total_detections; i++) + { + output.scores.push_back(data.at(count)); + count++; + } + + // Extract class indices + for (unsigned int i = 0; i < total_detections; i++) + { + output.classes.push_back(data.at(count)); + count++; + } + + // Extract number of detections + unsigned int num_detections = data.at(count); + if (num_detections > total_detections) + { + LOG(1, "Unexpected value for num_detections: " << num_detections << ", setting it to " << total_detections); + num_detections = total_detections; + } + + output.num_detections = num_detections; + return 0; +} + +int ObjectDetection::processOutputTensor(std::vector &objects, const std::vector &output_tensor, + const CnnOutputTensorInfo &output_tensor_info, + const Rectangle &scaler_crop) const +{ + if (output_tensor_info.num_tensors != 4) + { + LOG_ERROR("Invalid number of tensors " << output_tensor_info.num_tensors << ", expected 4"); + return -1; + } + + const unsigned int total_detections = output_tensor_info.info[0].tensor_data_num / 4; + ObjectDetectionOutput output; + + // 4x coords + 1x labels + 1x confidences + 1 total detections + if (output_tensor.size() != 6 * total_detections + 1) + { + LOG_ERROR("Invalid tensor size " << output_tensor.size() << ", expected " << 6 * total_detections + 1); + return -1; + } + + int ret = createObjectDetectionData(output, output_tensor, total_detections); + if (ret) + { + LOG_ERROR("Failed to create object detection data"); + return -1; + } + + for (unsigned int i = 0; i < std::min(output.num_detections, max_detections_); i++) + { + uint8_t class_index = (uint8_t)output.classes[i]; + + // Filter detections + if (output.scores[i] < threshold_ || class_index >= classes_.size()) + continue; + + // Extract bounding box co-ordinates in the inference image co-ordinates and convert to the final ISP output + // co-ordinates. + std::vector coords{ output.bboxes[i].x0, output.bboxes[i].y0, + output.bboxes[i].x1 - output.bboxes[i].x0, + output.bboxes[i].y1 - output.bboxes[i].y0 }; + const Rectangle obj_scaled = ConvertInferenceCoordinates(coords, scaler_crop); + + objects.emplace_back(class_index, classes_[class_index], output.scores[i], + obj_scaled.x, obj_scaled.y, obj_scaled.width, obj_scaled.height); + } + + LOG(2, "Number of objects detected: " << objects.size()); + for (unsigned i = 0; i < objects.size(); i++) + LOG(2, "[" << i << "] : " << objects[i].toString()); + + return 0; +} + +void ObjectDetection::filterOutputObjects(std::vector &objects) +{ + const Size isp_output_size = output_stream_->configuration().size; + bool empty = lt_objects_.empty(); + + for (auto <_obj : lt_objects_) + lt_obj.matched = false; + + for (auto const &object : objects) + { + bool matched = false; + for (auto <_obj : lt_objects_) + { + // Try and match a detected object in our long term list. + if (object.category == lt_obj.params.category && + std::abs(object.box.x - lt_obj.params.box.x) < tolerance_ * isp_output_size.width && + std::abs(object.box.y - lt_obj.params.box.y) < tolerance_ * isp_output_size.height && + std::abs((int)object.box.width - (int)lt_obj.params.box.width) < tolerance_ * isp_output_size.width && + std::abs((int)object.box.height - (int)lt_obj.params.box.height) < tolerance_ * isp_output_size.height) + { + lt_obj.matched = matched = true; + lt_obj.params.confidence = object.confidence; + lt_obj.params.box.x = factor_ * object.box.x + (1 - factor_) * lt_obj.params.box.x; + lt_obj.params.box.y = factor_ * object.box.y + (1 - factor_) * lt_obj.params.box.y; + lt_obj.params.box.width = factor_ * object.box.width + (1 - factor_) * lt_obj.params.box.width; + lt_obj.params.box.height = factor_ * object.box.height + (1 - factor_) * lt_obj.params.box.height; + // Reset the visibility counter for when the object next disappears. + lt_obj.visible = visible_frames_; + // Decrement the hidden counter until the object becomes visible in the list. + lt_obj.hidden = std::max(0, (int)lt_obj.hidden - 1); + break; + } + } + + // Add the object to the long term list if not found. This object will remain hidden for hidden_frames_ + // consecutive frames unless the long term list started out empty. + if (!matched) + { + const unsigned int hidden_frames = empty ? 0 : hidden_frames_; + lt_objects_.push_back({ object, visible_frames_, hidden_frames, 1 }); + } + } + + for (auto <_obj : lt_objects_) + { + if (!lt_obj.matched) + { + // If a non matched object in the long term list is still hidden, set visible count to 0 so that it must be + // matched for hidden_frames_ consecutive frames before becoming visible. Otherwise, decrement the visible + // count of unmatched objects in the long term list. + if (lt_obj.hidden) + lt_obj.visible = 0; + else + lt_obj.visible--; + } + } + + // Remove now invisible objects from the long term list. + lt_objects_.erase(std::remove_if(lt_objects_.begin(), lt_objects_.end(), + [] (const LtObject &obj) { return !obj.matched && !obj.visible; }), + lt_objects_.end()); +} + +static PostProcessingStage *Create(RPiCamApp *app) +{ + return new ObjectDetection(app); +} + +static RegisterStage reg(NAME, &Create); diff --git a/post_processing_stages/imx500/imx500_posenet.cpp b/post_processing_stages/imx500/imx500_posenet.cpp new file mode 100644 index 00000000..31e3c978 --- /dev/null +++ b/post_processing_stages/imx500/imx500_posenet.cpp @@ -0,0 +1,867 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_posenet.cpp - IMX500 inference for PoseNet + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "core/rpicam_app.hpp" +#include "post_processing_stages/post_processing_stage.hpp" + +#include "imx500_post_processing_stage.hpp" + +using Rectangle = libcamera::Rectangle; +using Size = libcamera::Size; +namespace controls = libcamera::controls; + +namespace +{ + +constexpr Size INPUT_TENSOR_SIZE = { 481, 353 }; +constexpr Size MAP_SIZE = { 31, 23 }; +constexpr unsigned int NUM_KEYPOINTS = 17; +// These 16 edges allow traversing of the pose graph along the mid_offsets (see +// paper for details). +static constexpr int NUM_EDGES = 16; +constexpr unsigned int STRIDE = 16; +constexpr unsigned int NUM_HEATMAPS = NUM_KEYPOINTS * MAP_SIZE.width * MAP_SIZE.height; +constexpr unsigned int NUM_SHORT_OFFSETS = 2 * NUM_KEYPOINTS * MAP_SIZE.width * MAP_SIZE.height; +constexpr unsigned int NUM_MID_OFFSETS = 64 * MAP_SIZE.width * MAP_SIZE.height; + +enum KeypointType +{ + Nose, + LeftEye, + RightEye, + LeftEar, + RightEar, + LeftShoulder, + RightShoulder, + LeftElbow, + RightElbow, + LeftWrist, + RightWrist, + LeftHip, + RightHip, + LeftKnee, + RightKnee, + LeftAnkle, + RightAnkle +}; + +const std::array, 32> EdgeList = { { + // Forward edges + { Nose, LeftEye }, + { LeftEye, LeftEar }, + { Nose, RightEye }, + { RightEye, RightEar }, + { Nose, LeftShoulder }, + { LeftShoulder, LeftElbow }, + { LeftElbow, LeftWrist }, + { LeftShoulder, LeftHip }, + { LeftHip, LeftKnee }, + { LeftKnee, LeftAnkle }, + { Nose, RightShoulder }, + { RightShoulder, RightElbow }, + { RightElbow, RightWrist }, + { RightShoulder, RightHip }, + { RightHip, RightKnee }, + { RightKnee, RightAnkle }, + + // Backward edges + { LeftEye, Nose }, + { LeftEar, LeftEye }, + { RightEye, Nose }, + { RightEar, RightEye }, + { LeftShoulder, Nose }, + { LeftElbow, LeftShoulder }, + { LeftWrist, LeftElbow }, + { LeftHip, LeftShoulder }, + { LeftKnee, LeftHip }, + { LeftAnkle, LeftKnee }, + { RightShoulder, Nose }, + { RightElbow, RightShoulder }, + { RightWrist, RightElbow }, + { RightHip, RightShoulder }, + { RightKnee, RightHip }, + { RightAnkle, RightKnee }, +} }; + +struct Point +{ + float y, x; +}; + +// An adjacency list representing the directed edges connecting keypoints. +struct AdjacencyList +{ + explicit AdjacencyList(const int num_nodes) : child_ids(num_nodes), edge_ids(num_nodes) {} + + // child_ids[i] is a vector holding the node ids of all children of the i-th + // node and edge_ids[i] is a vector holding the edge ids of all edges stemming + // from the i-th node. If the k-th edge in the graph starts at the i-th node + // and ends at the j-th node, then child_ids[i] and edge_ids will contain j + // and k, respectively, at corresponding positions. + std::vector> child_ids; + std::vector> edge_ids; +}; + +// Defines a 2-D keypoint with (x, y) float coordinates and its type id. +struct KeypointWithScore +{ + KeypointWithScore(const Point &_pt, const int _id, const float _score) : point(_pt), id(_id), score(_score) {} + + [[maybe_unused]] friend std::ostream &operator<<(std::ostream &ost, const KeypointWithScore &keypoint) + { + return ost << keypoint.point.y << ", " << keypoint.point.x << ", " << keypoint.id << ", " << keypoint.score; + } + + bool operator<(const KeypointWithScore &other) const { return score < other.score; } + bool operator>(const KeypointWithScore &other) const { return score > other.score; } + + Point point; + int id; + float score; +}; + +using KeypointQueue = std::priority_queue>; +using PoseKeypoints = std::array; +using PoseKeypointScores = std::array; + +struct PoseResults +{ + float pose_score; + PoseKeypoints pose_keypoints; + PoseKeypointScores pose_keypoint_scores; +}; + +float sigmoid(const float x) +{ + return 1.0f / (1.0f + std::exp(-x)); +} + +float log_odds(const float x) +{ + return -std::log(1.0f / (x + 1E-6) - 1.0f); +} + +// Computes the squared distance between a pair of 2-D points. +float squared_distance(const Point &a, const Point &b) +{ + const float dy = b.y - a.y; + const float dx = b.x - a.x; + return dy * dy + dx * dx; +} + +std::vector format_tensor(const float *data, unsigned int size, unsigned int div) +{ + std::vector tensor(size * MAP_SIZE.width * MAP_SIZE.height); + + for (unsigned int i = 0; i < size; i++) + { + for (unsigned int j = 0; j < MAP_SIZE.width; j++) + { + for (unsigned int k = 0; k < MAP_SIZE.height; k++) + { + tensor[(size * MAP_SIZE.width * k) + (size * j) + i] = + data[(MAP_SIZE.width * MAP_SIZE.height * i) + (j * MAP_SIZE.height) + k] / div; + } + } + } + + return tensor; +} + +// Build an adjacency list of the pose graph. +AdjacencyList build_agency_list() +{ + AdjacencyList adjacency_list(NUM_KEYPOINTS); + + for (unsigned int k = 0; k < EdgeList.size(); ++k) + { + const int parent_id = EdgeList[k].first; + const int child_id = EdgeList[k].second; + adjacency_list.child_ids[parent_id].push_back(child_id); + adjacency_list.edge_ids[parent_id].push_back(k); + } + + return adjacency_list; +} + +bool pass_keypoint_nms(std::vector poses, const size_t num_poses, const KeypointWithScore &keypoint, + const float squared_nms_radius) +{ + for (unsigned int i = 0; i < num_poses; ++i) + { + if (squared_distance(keypoint.point, poses[i][keypoint.id]) <= squared_nms_radius) + return false; + } + + return true; +} + +// Finds the indices of the scores if we sort them in decreasing order. +void decreasing_arg_sort(std::vector &indices, const float *scores, unsigned int num_scores) +{ + indices.resize(num_scores); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [&scores](const int i, const int j) { return scores[i] > scores[j]; }); +} + +// Finds the indices of the scores if we sort them in decreasing order. +void decreasing_arg_sort(std::vector &indices, const std::vector &scores) +{ + decreasing_arg_sort(indices, scores.data(), scores.size()); +} + +// Helper function for 1-D linear interpolation. It computes the floor and the +// ceiling of the input coordinate, as well as the weighting factor between the +// two interpolation endpoints, such that: +// y = (1 - x_lerp) * vec[x_floor] + x_lerp * vec[x_ceil] +void build_linear_interpolation(const float x, const int n, int *x_floor, int *x_ceil, float *x_lerp) +{ + const float x_proj = std::clamp(x, 0.0f, n - 1.0f); + *x_floor = static_cast(std::floor(x_proj)); + *x_ceil = static_cast(std::ceil(x_proj)); + *x_lerp = x - (*x_floor); +} + +// Helper function for 2-D bilinear interpolation. It computes the four corners +// of the 2x2 cell that contain the input coordinates (x, y), as well as the +// weighting factor between the four interpolation endpoints, such that: +// y = +// (1 - y_lerp) * ((1 - x_lerp) * vec[top_left] + x_lerp * vec[top_right]) + +// y_lerp * ((1 - x_lerp) * tensor[bottom_left] + x_lerp * vec[bottom_right]) +void build_bilinear_interpolation(const Point point, const unsigned int num_channels, int *top_left, int *top_right, + int *bottom_left, int *bottom_right, float *y_lerp, float *x_lerp) +{ + int y_floor; + int y_ceil; + build_linear_interpolation(point.y, MAP_SIZE.height, &y_floor, &y_ceil, y_lerp); + int x_floor; + int x_ceil; + build_linear_interpolation(point.x, MAP_SIZE.width, &x_floor, &x_ceil, x_lerp); + *top_left = (y_floor * MAP_SIZE.width + x_floor) * num_channels; + *top_right = (y_floor * MAP_SIZE.width + x_ceil) * num_channels; + *bottom_left = (y_ceil * MAP_SIZE.width + x_floor) * num_channels; + *bottom_right = (y_ceil * MAP_SIZE.width + x_ceil) * num_channels; +} + +// Sample the input tensor values at position (x, y) and at multiple channels. +// The input tensor has shape [height, width, num_channels]. We bilinearly +// sample its value at tensor(y, x, c), for c in the channels specified. This +// is faster than calling the single channel interpolation function multiple +// times because the computation of the positions needs to be done only once. +std::vector sample_tensor_at_multiple_channels(const std::vector &tensor, const Point &point, + const std::vector &result_channels, + unsigned int num_channels) +{ + int top_left, top_right, bottom_left, bottom_right; + float y_lerp, x_lerp; + + build_bilinear_interpolation(point, num_channels, &top_left, &top_right, &bottom_left, &bottom_right, &y_lerp, + &x_lerp); + + std::vector result; + for (auto &c : result_channels) + { + result.push_back((1 - y_lerp) * ((1 - x_lerp) * tensor[top_left + c] + x_lerp * tensor[top_right + c]) + + y_lerp * ((1 - x_lerp) * tensor[bottom_left + c] + x_lerp * tensor[bottom_right + c])); + } + + return result; +} + +// Sample the input tensor values at position (x, y) and at a single channel. +// The input tensor has shape [height, width, num_channels]. We bilinearly +// sample its value at tensor(y, x, channel). +float sample_tensor_at_single_channel(const std::vector &tensor, const Point &point, unsigned int num_channels, + const int c) +{ + std::vector result = sample_tensor_at_multiple_channels(tensor, point, std::vector { c }, num_channels); + return result[0]; +} + +KeypointQueue build_keypoint_queue(const std::vector &scores, const std::vector &short_offsets, + const float score_threshold) +{ + constexpr unsigned int local_maximum_radius = 1; + unsigned int score_index = 0; + KeypointQueue queue; + + for (unsigned int y = 0; y < MAP_SIZE.height; ++y) + { + for (unsigned int x = 0; x < MAP_SIZE.width; ++x) + { + unsigned int offset_index = 2 * score_index; + for (unsigned int j = 0; j < NUM_KEYPOINTS; ++j) + { + const float score = scores[score_index]; + if (score >= score_threshold) + { + // Only consider keypoints whose score is maximum in a local window. + bool local_maximum = true; + const unsigned int y_start = std::max((int)y - (int)local_maximum_radius, 0); + const unsigned int y_end = std::min(y + local_maximum_radius + 1, MAP_SIZE.height); + for (unsigned int y_current = y_start; y_current < y_end; ++y_current) + { + const unsigned int x_start = std::max((int)x - (int)local_maximum_radius, 0); + const unsigned int x_end = std::min(x + local_maximum_radius + 1, MAP_SIZE.width); + for (unsigned int x_current = x_start; x_current < x_end; ++x_current) + { + if (scores[y_current * MAP_SIZE.width * NUM_KEYPOINTS + x_current * NUM_KEYPOINTS + j] > + score) + { + local_maximum = false; + break; + } + } + if (!local_maximum) + break; + } + if (local_maximum) + { + const float dy = short_offsets[offset_index]; + const float dx = short_offsets[offset_index + NUM_KEYPOINTS]; + const float y_refined = std::clamp(y + dy, 0.0f, MAP_SIZE.height - 1.0f); + const float x_refined = std::clamp(x + dx, 0.0f, MAP_SIZE.width - 1.0f); + queue.emplace(Point { y_refined, x_refined }, j, score); + } + } + + ++score_index; + ++offset_index; + } + } + } + + return queue; +} + +// Follows the mid-range offsets, and then refines the position by the short- +// range offsets for a fixed number of steps. +Point find_displaced_position(const std::vector &short_offsets, const std::vector &mid_offsets, + const Point &source, const int edge_id, const int target_id, + const int offset_refinement_steps) +{ + float y = source.y, x = source.x; + + // Follow the mid-range offsets. + std::vector channels = { edge_id, NUM_EDGES + edge_id }; + + // Total size of mid_offsets is height x width x 2*2*num_edges + std::vector offsets = sample_tensor_at_multiple_channels(mid_offsets, source, channels, 2 * 2 * NUM_EDGES); + y = std::clamp(y + offsets[0], 0.0f, MAP_SIZE.height - 1.0f); + x = std::clamp(x + offsets[1], 0.0f, MAP_SIZE.width - 1.0f); + + // Refine by the short-range offsets. + channels[0] = target_id; + channels[1] = NUM_KEYPOINTS + target_id; + for (int i = 0; i < offset_refinement_steps; ++i) + { + offsets = sample_tensor_at_multiple_channels(short_offsets, Point { y, x }, channels, 2 * NUM_KEYPOINTS); + y = std::clamp(y + offsets[0], 0.0f, MAP_SIZE.height - 1.0f); + x = std::clamp(x + offsets[1], 0.0f, MAP_SIZE.width - 1.0f); + } + + return Point { y, x }; +} + +void backtrack_decode_pose(const std::vector &scores, const std::vector &short_offsets, + const std::vector &mid_offsets, const KeypointWithScore &root, + const AdjacencyList &adjacency_list, PoseKeypoints &pose_keypoints, + PoseKeypointScores &keypoint_scores, unsigned int offset_refinement_steps) +{ + const float root_score = sample_tensor_at_single_channel(scores, root.point, root.id, NUM_KEYPOINTS); + + // Used in order to put candidate keypoints in a priority queue w.r.t. their + // score. Keypoints with higher score have higher priority and will be + // decoded/processed first. + KeypointQueue decode_queue; + decode_queue.push(KeypointWithScore(root.point, root.id, root_score)); + + // Keeps track of the keypoints whose position has already been decoded. + std::vector keypoint_decoded(NUM_KEYPOINTS, false); + + while (!decode_queue.empty()) + { + // The top element in the queue is the next keypoint to be processed. + const KeypointWithScore current_keypoint = decode_queue.top(); + decode_queue.pop(); + + if (keypoint_decoded[current_keypoint.id]) + continue; + + pose_keypoints[current_keypoint.id] = current_keypoint.point; + keypoint_scores[current_keypoint.id] = current_keypoint.score; + + keypoint_decoded[current_keypoint.id] = true; + + // Add the children of the current keypoint that have not been decoded yet + // to the priority queue. + const int num_children = adjacency_list.child_ids[current_keypoint.id].size(); + for (int j = 0; j < num_children; ++j) + { + const int child_id = adjacency_list.child_ids[current_keypoint.id][j]; + int edge_id = adjacency_list.edge_ids[current_keypoint.id][j]; + if (keypoint_decoded[child_id]) + continue; + + // The mid-offsets block is organized as 4 blocks of NUM_EDGES: + // [fwd Y offsets][fwd X offsets][bwd Y offsets][bwd X offsets] + // OTOH edge_id is [0,NUM_EDGES) for forward edges and + // [NUM_EDGES, 2*NUM_EDGES) for backward edges. + // Thus if the edge is a backward edge (>NUM_EDGES) then we need + // to start 16 indices later to be correctly aligned with the mid-offsets. + if (edge_id > NUM_EDGES) + edge_id += NUM_EDGES; + + const Point child_point = find_displaced_position(short_offsets, mid_offsets, current_keypoint.point, + edge_id, child_id, offset_refinement_steps); + const float child_score = sample_tensor_at_single_channel(scores, child_point, NUM_KEYPOINTS, child_id); + decode_queue.emplace(child_point, child_id, child_score); + } + } +} + +void find_overlapping_keypoints(std::vector &mask, const PoseKeypoints &pose1, const PoseKeypoints &pose2, + const float squared_radius) +{ + for (unsigned int k = 0; k < mask.size(); ++k) + { + if (squared_distance(pose1[k], pose2[k]) <= squared_radius) + mask[k] = true; + } +} + +void perform_soft_keypoint_NMS(std::vector &all_instance_scores, const std::vector &decreasing_indices, + const std::vector &all_keypoint_coords, + const std::vector &all_keypoint_scores, + const float squared_nms_radius) +{ + const int num_instances = decreasing_indices.size(); + + all_instance_scores.resize(num_instances); + // Indicates the occlusion status of the keypoints of the active instance. + std::vector keypoint_occluded(NUM_KEYPOINTS); + // Indices of the keypoints of the active instance in decreasing score value. + std::vector indices(NUM_KEYPOINTS); + for (int i = 0; i < num_instances; ++i) + { + const int current_index = decreasing_indices[i]; + // Find the keypoints of the current instance which are overlapping with + // the corresponding keypoints of the higher-scoring instances and + // zero-out their contribution to the score of the current instance. + std::fill(keypoint_occluded.begin(), keypoint_occluded.end(), false); + for (int j = 0; j < i; ++j) + { + const int previous_index = decreasing_indices[j]; + find_overlapping_keypoints(keypoint_occluded, all_keypoint_coords[current_index], + all_keypoint_coords[previous_index], squared_nms_radius); + } + // We compute the argsort keypoint indices based on the original keypoint + // scores, but we do not let them contribute to the instance score if they + // have been non-maximum suppressed. + decreasing_arg_sort(indices, all_keypoint_scores[current_index].data(), + all_keypoint_scores[current_index].size()); + float total_score = 0.0f; + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + { + if (!keypoint_occluded[indices[k]]) + total_score += all_keypoint_scores[current_index][indices[k]]; + } + all_instance_scores[current_index] = total_score / NUM_KEYPOINTS; + } +} + +} // namespace + +#define NAME "imx500_posenet" + +class PoseNet : public IMX500PostProcessingStage +{ +public: + PoseNet(RPiCamApp *app) : IMX500PostProcessingStage(app) {} + + char const *Name() const override; + + void Read(boost::property_tree::ptree const ¶ms) override; + + void Configure() override; + + bool Process(CompletedRequestPtr &completed_request) override; + +private: + std::vector decodeAllPoses(const std::vector &scores, const std::vector &short_offsets, + const std::vector &mid_offsets); + void translateCoordinates(std::vector &results, const Rectangle &scaler_crop) const; + void filterOutputObjects(const std::vector &results); + + struct LtResults + { + PoseResults results; + unsigned int visible; + unsigned int hidden; + bool matched; + }; + + std::vector lt_results_; + std::mutex lt_lock_; + + // Config params: + float threshold_; + unsigned int max_detections_; + unsigned int offset_refinement_steps_; + float nms_radius_; + bool temporal_filtering_; + + float tolerance_; + float factor_; + unsigned int visible_frames_; + unsigned int hidden_frames_; + bool started_ = false; +}; + +char const *PoseNet::Name() const +{ + return NAME; +} + +void PoseNet::Read(boost::property_tree::ptree const ¶ms) +{ + max_detections_ = params.get("max_detections", 10); + threshold_ = params.get("threshold", 0.5f); + offset_refinement_steps_ = params.get("offset_refinement_steps", 5); + nms_radius_ = params.get("nms_radius", 10) / STRIDE; + + if (params.find("temporal_filter") != params.not_found()) + { + temporal_filtering_ = true; + tolerance_ = params.get("temporal_filter.tolerance", 0.05); + factor_ = params.get("temporal_filter.factor", 0.2); + visible_frames_ = params.get("temporal_filter.visible_frames", 5); + hidden_frames_ = params.get("temporal_filter.hidden_frames", 2); + } + else + temporal_filtering_ = false; + + IMX500PostProcessingStage::Read(params); +} + +void PoseNet::Configure() +{ + lt_results_.clear(); + IMX500PostProcessingStage::Configure(); + if (!started_) + { + IMX500PostProcessingStage::ShowFwProgressBar(); + started_ = true; + } +} + +bool PoseNet::Process(CompletedRequestPtr &completed_request) +{ + auto scaler_crop = completed_request->metadata.get(controls::ScalerCrop); + if (!raw_stream_ || !scaler_crop) + { + LOG_ERROR("Must have RAW stream and scaler crop available to get sensor dimensions!"); + return false; + } + + auto output = completed_request->metadata.get(controls::rpi::CnnOutputTensor); + if (!output) + { + LOG_ERROR("No output tensor found in metadata!"); + return false; + } + + if (output->size() < NUM_HEATMAPS + NUM_SHORT_OFFSETS + NUM_MID_OFFSETS) + { + LOG_ERROR("Unexpected output tensor size: " << output->size()); + return false; + } + + std::vector scores = + format_tensor((float *)output->data(), NUM_HEATMAPS / (MAP_SIZE.width * MAP_SIZE.height), 1); + std::vector short_offsets = format_tensor((float *)output->data() + NUM_HEATMAPS, + NUM_SHORT_OFFSETS / (MAP_SIZE.width * MAP_SIZE.height), STRIDE); + std::vector mid_offsets = format_tensor((float *)output->data() + NUM_HEATMAPS + NUM_SHORT_OFFSETS, + NUM_MID_OFFSETS / (MAP_SIZE.width * MAP_SIZE.height), STRIDE); + + std::vector results = decodeAllPoses(scores, short_offsets, mid_offsets); + translateCoordinates(results, *scaler_crop); + + std::vector> locations; + std::vector> confidences; + + if (temporal_filtering_) + { + // Process() can be concurrently called through different threads for consecutive CompletedRequests if + // things are running behind. So protect access to the lt_results_ state object. + std::scoped_lock l(lt_lock_); + + filterOutputObjects(results); + for (auto const <_r : lt_results_) + { + if (lt_r.hidden) + continue; + + locations.push_back({}); + confidences.push_back({}); + + for (auto const &s : lt_r.results.pose_keypoint_scores) + confidences.back().push_back(s); + for (auto const &k : lt_r.results.pose_keypoints) + locations.back().emplace_back(k.x, k.y); + } + } + else if (results.size()) + { + for (auto const &result : results) + { + locations.push_back({}); + confidences.push_back({}); + + for (auto const &s : result.pose_keypoint_scores) + confidences.back().push_back(s); + for (auto const &k : result.pose_keypoints) + locations.back().emplace_back(k.x, k.y); + } + } + + if (locations.size() && locations.size() == confidences.size()) + { + completed_request->post_process_metadata.Set("pose_estimation.locations", locations); + completed_request->post_process_metadata.Set("pose_estimation.confidences", confidences); + } + + return IMX500PostProcessingStage::Process(completed_request); +} + +// Decodes poses from the score map, the short and mid offsets. +// "Block space" refers to the output y and z size of the network. +// For example if the network that takes a (353,481) (y,x) input image will have +// an output field of 23, 31. Thus the sizes of the input vectors to this +// functions will be +// scores: 23 x 31 x NUM_KEYPOINTS +// short_offsets 23 x 31 x 2 x NUM_KEYPOINTS (x and y per keypoint) +// mid_offsets 23 x 31 x 2 x 2 2 NUM_EDGES (x and y for each fwd and bwd edge) +// Thus height and width need to be set to 23 and 31 respectively. +// nms_radius must also be given in these units. +// The output coordinates will be in pixel coordinates. +// +// For details see https://arxiv.org/abs/1803.08225 +// PersonLab: Person Pose Estimation and Instance Segmentation with a +// Bottom-Up, Part-Based, Geometric Embedding Model +// George Papandreou, Tyler Zhu, Liang-Chieh Chen, Spyros Gidaris, +// Jonathan Tompson, Kevin Murphy +std::vector PoseNet::decodeAllPoses(const std::vector &scores, + const std::vector &short_offsets, + const std::vector &mid_offsets) +{ + const float min_score_logit = log_odds(threshold_); + + KeypointQueue queue = build_keypoint_queue(scores, short_offsets, min_score_logit); + AdjacencyList adjacency_list = build_agency_list(); + + std::vector indices(NUM_KEYPOINTS); + + // Generate at most max_detections object instances per image in decreasing + // root part score order. + std::vector all_instance_scores; + + std::vector scratch_poses(NUM_KEYPOINTS); + std::vector scratch_keypoint_scores(max_detections_); + + unsigned int pose_counter = 0; + while (pose_counter < max_detections_ && !queue.empty()) + { + // The top element in the queue is the next root candidate. + const KeypointWithScore root = queue.top(); + queue.pop(); + + // Reject a root candidate if it is within a disk of nms_radius_ pixels + // from the corresponding part of a previously detected instance. + if (!pass_keypoint_nms(scratch_poses, pose_counter, root, nms_radius_ * nms_radius_)) + continue; + + auto &next_pose = scratch_poses[pose_counter]; + auto &next_scores = scratch_keypoint_scores[pose_counter]; + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + { + next_pose[k].x = -1.0f; + next_pose[k].y = -1.0f; + next_scores[k] = -1E5; + } + + backtrack_decode_pose(scores, short_offsets, mid_offsets, root, adjacency_list, next_pose, next_scores, + offset_refinement_steps_); + + // Convert keypoint-level scores from log-odds to probabilities and compute + // an initial instance-level score as the average of the scores of the top-k + // scoring keypoints. + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + next_scores[k] = sigmoid(next_scores[k]); + + decreasing_arg_sort(indices, next_scores.data(), next_scores.size()); + + float instance_score = 0.0f; + for (unsigned int j = 0; j < NUM_KEYPOINTS; ++j) + instance_score += next_scores[indices[j]]; + + instance_score /= NUM_KEYPOINTS; + + if (instance_score >= threshold_) + { + pose_counter++; + all_instance_scores.push_back(instance_score); + } + } + + // Sort the detections in decreasing order of their instance-level scores. + std::vector decreasing_indices; + decreasing_arg_sort(decreasing_indices, all_instance_scores); + + // Keypoint-level soft non-maximum suppression and instance-level rescoring as + // the average of the top-k keypoints in terms of their keypoint-level scores. + perform_soft_keypoint_NMS(all_instance_scores, decreasing_indices, scratch_poses, scratch_keypoint_scores, + nms_radius_ * nms_radius_); + + // Sort the detections in decreasing order of their final instance-level + // scores. Usually the order does not change but this is not guaranteed. + decreasing_arg_sort(decreasing_indices, all_instance_scores); + + std::vector results; + for (int index : decreasing_indices) + { + if (all_instance_scores[index] < threshold_) + break; + + // New result. + results.push_back({}); + + // Rescale keypoint coordinates into pixel space (much more useful for user). + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + { + results.back().pose_keypoints[k].y = scratch_poses[index][k].y * STRIDE; + results.back().pose_keypoints[k].x = scratch_poses[index][k].x * STRIDE; + } + + std::copy(scratch_keypoint_scores[index].begin(), scratch_keypoint_scores[index].end(), + results.back().pose_keypoint_scores.begin()); + results.back().pose_score = all_instance_scores[index]; + } + + return results; +} + +void PoseNet::translateCoordinates(std::vector &results, const Rectangle &scaler_crop) const +{ + for (auto &r : results) + { + for (auto &keypoint : r.pose_keypoints) + { + std::vector coords{ keypoint.x / (INPUT_TENSOR_SIZE.width - 1), + keypoint.y / (INPUT_TENSOR_SIZE.height - 1), + 0, 0 }; + Rectangle translated = ConvertInferenceCoordinates(coords, scaler_crop); + keypoint.x = translated.x; + keypoint.y = translated.y; + } + } +} + +void PoseNet::filterOutputObjects(const std::vector &results) +{ + const Size isp_output_size = output_stream_->configuration().size; + + for (auto <_r : lt_results_) + lt_r.matched = false; + + for (auto const &r : results) + { + bool matched = false; + for (auto <_r : lt_results_) + { + // Try and match a detected object in our long term list. + bool found = true; + for (unsigned int i = 0; i < NUM_KEYPOINTS; i++) + { + if (std::abs(lt_r.results.pose_keypoints[i].x - r.pose_keypoints[i].x) > + tolerance_ * isp_output_size.width || + std::abs(lt_r.results.pose_keypoints[i].y - r.pose_keypoints[i].y) > + tolerance_ * isp_output_size.height) + { + found = false; + break; + } + } + + if (found) + { + lt_r.matched = matched = true; + lt_r.results.pose_score = r.pose_score; + + for (unsigned int i = 0; i < NUM_KEYPOINTS; i++) + { + lt_r.results.pose_keypoint_scores[i] = + factor_ * r.pose_keypoint_scores[i] + (1 - factor_) * lt_r.results.pose_keypoint_scores[i]; + lt_r.results.pose_keypoints[i].x = + factor_ * r.pose_keypoints[i].x + (1 - factor_) * lt_r.results.pose_keypoints[i].x; + lt_r.results.pose_keypoints[i].y = + factor_ * r.pose_keypoints[i].y + (1 - factor_) * lt_r.results.pose_keypoints[i].y; + } + + // Reset the visibility counter for when the result next disappears. + lt_r.visible = visible_frames_; + // Decrement the hidden counter until the result becomes visible in the list. + lt_r.hidden = std::max(0, (int)lt_r.hidden - 1); + break; + } + } + + // Add the result to the long term list if not found. This result will remain hidden for hidden_frames_ + // consecutive frames. + if (!matched) + lt_results_.push_back({ r, visible_frames_, hidden_frames_, true }); + } + + for (auto <_r : lt_results_) + { + if (!lt_r.matched) + { + // If a non matched result in the long term list is still hidden, set visible count to 0 so that it must be + // matched for hidden_frames_ consecutive frames before becoming visible. Otherwise, decrement the visible + // count of unmatched objects in the long term list. + if (lt_r.hidden) + lt_r.visible = 0; + else + lt_r.visible--; + } + } + + // Remove now invisible objects from the long term list. + lt_results_.erase(std::remove_if(lt_results_.begin(), lt_results_.end(), + [] (const LtResults &obj) { return !obj.matched && !obj.visible; }), + lt_results_.end()); +} + +PostProcessingStage *Create(RPiCamApp *app) +{ + return new PoseNet(app); +} + +RegisterStage reg(NAME, &Create); diff --git a/post_processing_stages/imx500/imx500_post_processing_stage.cpp b/post_processing_stages/imx500/imx500_post_processing_stage.cpp new file mode 100644 index 00000000..5114871f --- /dev/null +++ b/post_processing_stages/imx500/imx500_post_processing_stage.cpp @@ -0,0 +1,280 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_post_rpocessing_stage.cpp - IMX500 post processing stage base class + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "imx500_post_processing_stage.hpp" + +#include + +using Stream = libcamera::Stream; +using Rectangle = libcamera::Rectangle; +using Size = libcamera::Size; + +using namespace std::chrono_literals; + +namespace fs = std::filesystem; + +namespace +{ + +const unsigned int ROI_CTRL_ID = 0x00982900; +const unsigned int NETWORK_FW_CTRL_ID = 0x00982901; + +inline int16_t conv_reg_signed(int16_t reg) +{ + constexpr unsigned int ROT_DNN_NORM_SIGNED_SHT = 8; + constexpr unsigned int ROT_DNN_NORM_MASK = 0x01FF; + + if (!((reg >> ROT_DNN_NORM_SIGNED_SHT) & 1)) + return reg; + else + return -((~reg + 1) & ROT_DNN_NORM_MASK); +} + +} // namespace + + +IMX500PostProcessingStage::IMX500PostProcessingStage(RPiCamApp *app) + : PostProcessingStage(app), device_fd_(-1) +{ + for (unsigned int i = 0; i < 16; i++) + { + const fs::path test_dir { "/sys/class/video4linux/v4l-subdev" + std::to_string(i) + "/device" }; + const fs::path module_dir { test_dir.string() + "/driver/module" }; + const fs::path id_dir { test_dir.string() + "/of_node" }; + + if (fs::exists(module_dir) && fs::is_symlink(module_dir)) + { + fs::path ln = fs::read_symlink(module_dir); + if (ln.string().find("imx500") != std::string::npos) + { + const std::string dev_node { "/dev/v4l-subdev" + std::to_string(i) }; + device_fd_ = open(dev_node.c_str(), O_RDONLY, 0); + + /* Find the progress indicator sysfs dev nodes. */ + const std::string test_dir_str = fs::read_symlink(test_dir).string(); + const std::size_t pos = test_dir_str.find_last_of("/") + 1; + assert(pos != std::string::npos); + + const std::string imx500_device_id = test_dir_str.substr(pos); + std::string spi_device_id = imx500_device_id; + const std::size_t rep = spi_device_id.find("001a"); + spi_device_id.replace(rep, 4, "0040"); + + const fs::path imx500_progress { "/sys/kernel/debug/imx500-fw:" + imx500_device_id + "/fw_progress" }; + const fs::path spi_progress { "/sys/kernel/debug/rp2040-spi:" + spi_device_id + "/transfer_progress" }; + + fw_progress_.open(imx500_progress.c_str(), std::ios_base::in); + fw_progress_chunk_.open(spi_progress.c_str(), std::ios_base::in); + + break; + } + } + } + + if (device_fd_ < 0) + throw std::runtime_error("Cannot open imx500 device node"); + + SetInferenceRoiAbs({ 0, 0, 4056, 3040 }); +} + +IMX500PostProcessingStage::~IMX500PostProcessingStage() +{ + if (device_fd_ >= 0) + close(device_fd_); +} + +void IMX500PostProcessingStage::Read(boost::property_tree::ptree const ¶ms) +{ + if (params.find("save_input_tensor") != params.not_found()) + { + auto const &pt = params.get_child("save_input_tensor"); + + std::string filename = pt.get("filename"); + num_input_tensors_saved_ = pt.get("num_tensors", 1); + input_tensor_file_ = std::ofstream(filename, std::ios::out | std::ios::binary); + + norm_val_ = PostProcessingStage::GetJsonArray(pt, "norm_val", { 0, 0, 0, 0 }); + norm_shift_ = PostProcessingStage::GetJsonArray(pt, "norm_shift", { 0, 0, 0, 0 }); + div_val_ = PostProcessingStage::GetJsonArray(pt, "div_val", { 1, 1, 1, 1 }); + div_shift_ = pt.get("div_shift", 0); + } + + /* Load the network firmware. */ + std::string network_file = params.get("network_file"); + if (!fs::exists(network_file)) + throw std::runtime_error(network_file + " not found!"); + + int fd = open(network_file.c_str(), O_RDONLY, 0); + + v4l2_control ctrl { NETWORK_FW_CTRL_ID, fd }; + int ret = ioctl(device_fd_, VIDIOC_S_CTRL, &ctrl); + if (ret) + throw std::runtime_error("failed to set network fw ioctl"); + + close(fd); + + LOG(1, "\n------------------------------------------------------------------------------------------------------------------\n" + "NOTE: Loading network firmware onto the IMX500 can take several minutes, please do not close down the application." + "\n------------------------------------------------------------------------------------------------------------------\n"); +} + +void IMX500PostProcessingStage::Configure() +{ + output_stream_ = app_->GetMainStream(); + raw_stream_ = app_->RawStream(); + save_frames_ = num_input_tensors_saved_; +} + +bool IMX500PostProcessingStage::Process(CompletedRequestPtr &completed_request) +{ + auto input = completed_request->metadata.get(controls::rpi::CnnInputTensor); + + if (input && input_tensor_file_.is_open()) + { + // There is a chance that this may be called through multiple threads, so serialize the file access. + std::scoped_lock l(lock_); + + for (unsigned int i = 0; i < input->size(); i++) + { + const unsigned int channel = i % 3; // Assume RGB interleaved format. + int16_t sample = static_cast(input->data()[i]); + sample = (sample << norm_shift_[channel]) - conv_reg_signed(norm_val_[channel]); + sample = ((sample << div_shift_) / div_val_[channel]) & 0xFF; + input_tensor_file_.put(static_cast(sample)); + } + + if (--save_frames_ == 0) + input_tensor_file_.close(); + } + + return false; +} + +Rectangle IMX500PostProcessingStage::ConvertInferenceCoordinates(const std::vector &coords, + const Rectangle &scaler_crop) const +{ + // Convert the inference image co-ordinates into the final ISP output co-ordinates. + const Size &isp_output_size = output_stream_->configuration().size; + const Size &sensor_output_size = raw_stream_->configuration().size; + const Rectangle sensor_crop = scaler_crop.scaledBy(sensor_output_size, full_sensor_resolution_.size()); + + if (coords.size() != 4) + return {}; + + // Object scaled to the full sensor resolution + Rectangle obj; + obj.x = std::round(coords[0] * (full_sensor_resolution_.width - 1)); + obj.y = std::round(coords[1] * (full_sensor_resolution_.height - 1)); + obj.width = std::round(coords[2] * (full_sensor_resolution_.width - 1)); + obj.height = std::round(coords[3] * (full_sensor_resolution_.height - 1)); + + // Object on inference image -> sensor image + const Rectangle obj_sensor = obj.scaledBy(sensor_output_size, full_sensor_resolution_.size()); + // -> bounded to the ISP crop on the sensor image + const Rectangle obj_bound = obj_sensor.boundedTo(sensor_crop); + // -> translated by the start of the crop offset + const Rectangle obj_translated = obj_bound.translatedBy(-sensor_crop.topLeft()); + // -> and finally scaled to the ISP output. + const Rectangle obj_scaled = obj_translated.scaledBy(isp_output_size, sensor_crop.size()); + + LOG(2, obj << " -> (sensor) " << obj_sensor << " -> (bound) " << obj_bound + << " -> (translate) " << obj_translated << " -> (scaled) " << obj_scaled); + + return obj_scaled; +} + +void IMX500PostProcessingStage::SetInferenceRoiAbs(const Rectangle &roi_) const +{ + Rectangle roi = roi_.boundedTo(full_sensor_resolution_); + const uint32_t roi_array[4] = { (uint32_t)roi.x, (uint32_t)roi.y, (uint32_t)roi.width, (uint32_t)roi.height }; + + v4l2_ext_control roi_ctrl; + roi_ctrl.id = ROI_CTRL_ID; + roi_ctrl.p_u32 = (unsigned int *)&roi_array; + roi_ctrl.size = 16; + + v4l2_ext_controls ctrls; + ctrls.count = 1; + ctrls.controls = (v4l2_ext_control *)&roi_ctrl; + + int ret = ioctl(device_fd_, VIDIOC_S_EXT_CTRLS, &ctrls); + if (ret) + LOG_ERROR("IMX500: Unable to set absolute ROI"); +} + +void IMX500PostProcessingStage::SetInferenceRoiAuto(const unsigned int width, const unsigned int height) const +{ + Size s = full_sensor_resolution_.size().boundedToAspectRatio(Size(width, height)); + Rectangle r = s.centeredTo(full_sensor_resolution_.center()).enclosedIn(full_sensor_resolution_); + SetInferenceRoiAbs(r); +} + +void IMX500PostProcessingStage::ShowFwProgressBar() +{ + if (fw_progress_.is_open() && fw_progress_chunk_.is_open()) + { + std::thread progress_thread { &IMX500PostProcessingStage::doProgressBar, this }; + progress_thread.detach(); + } +} + +std::vector split(std::stringstream &stream) +{ + std::vector result; + unsigned int n; + + while (stream >> n) + result.push_back(n); + + return result; +} + +void IMX500PostProcessingStage::doProgressBar() +{ + while (1) + { + std::stringstream fw_progress_str, block_str; + unsigned int block_progress; + + fw_progress_.seekg(0); + fw_progress_chunk_.seekg(0); + fw_progress_str << fw_progress_.rdbuf(); + block_str << fw_progress_chunk_.rdbuf(); + + std::vector progress = split(fw_progress_str); + block_str >> block_progress; + + // [0] == FW state, [1] == current size, [2] == total size. + if (progress.size() == 3 && progress[0] == 2) + { + unsigned int current = progress[1] + block_progress; + std::cerr << "Network Firmware Upload: " << current * 100 / progress[2] << "% (" << current / 1024 << "/" + << progress[2] / 1024 << " KB)\r"; + std::cerr.flush(); + + if (progress[2] && progress[1] == progress[2]) + { + std::cerr << std::endl; + break; + } + } + + std::this_thread::sleep_for(500ms); + } +} diff --git a/post_processing_stages/imx500/imx500_post_processing_stage.hpp b/post_processing_stages/imx500/imx500_post_processing_stage.hpp new file mode 100644 index 00000000..4fcd76b5 --- /dev/null +++ b/post_processing_stages/imx500/imx500_post_processing_stage.hpp @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_post_rpocessing_stage.hpp - IMX500 post processing stage base class + */ + +#pragma once + +#include +#include + +#include + +#include +#include + +#include "core/completed_request.hpp" +#include "core/rpicam_app.hpp" +#include "post_processing_stages/post_processing_stage.hpp" + +class IMX500PostProcessingStage : public PostProcessingStage +{ +public: + static constexpr unsigned int Max_Num_Tensors = 8; + static constexpr unsigned int Network_Name_Len = 64; + + struct OutputTensorInfo + { + uint32_t tensor_data_num; + uint16_t size; + uint8_t ordinal; + uint8_t serialization_index; + }; + + struct CnnOutputTensorInfo + { + char network_name[Network_Name_Len]; + uint32_t num_tensors; + OutputTensorInfo info[Max_Num_Tensors]; + }; + + IMX500PostProcessingStage(RPiCamApp *app); + ~IMX500PostProcessingStage(); + + void Read(boost::property_tree::ptree const ¶ms) override; + + void Configure() override; + + bool Process(CompletedRequestPtr &completed_request) override; + + libcamera::Rectangle ConvertInferenceCoordinates(const std::vector &coords, + const libcamera::Rectangle &scalerCrop) const; + void SetInferenceRoiAbs(const libcamera::Rectangle &roi_) const; + void SetInferenceRoiAuto(const unsigned int width, const unsigned int height) const; + void ShowFwProgressBar(); + +protected: + libcamera::Rectangle full_sensor_resolution_ = libcamera::Rectangle(0, 0, 4056, 3040); + libcamera::Stream *output_stream_; + libcamera::Stream *raw_stream_; + +private: + void doProgressBar(); + + int device_fd_; + std::ifstream fw_progress_; + std::ifstream fw_progress_chunk_; + + std::ofstream input_tensor_file_; + unsigned int num_input_tensors_saved_; + unsigned int save_frames_; + std::vector norm_val_; + std::vector norm_shift_; + std::vector div_val_; + unsigned int div_shift_; + std::mutex lock_; +}; diff --git a/post_processing_stages/imx500/meson.build b/post_processing_stages/imx500/meson.build new file mode 100644 index 00000000..985fbc8f --- /dev/null +++ b/post_processing_stages/imx500/meson.build @@ -0,0 +1,31 @@ +imx500_postprocessing_src = files([ + # Base stage + 'imx500_post_processing_stage.cpp', + # Object detection + 'imx500_object_detection.cpp', + # Posenet + 'imx500_posenet.cpp', +]) + +postproc_assets += files([ + assets_dir / 'imx500_mobilenet_ssd.json', + assets_dir / 'imx500_posenet.json', +]) + +imx500_postprocessing_lib = shared_module('imx500-postproc', imx500_postprocessing_src, + dependencies : libcamera_dep, + include_directories : '../..', + install : true, + install_dir : posproc_libdir, + name_prefix : '', + ) + +if get_option('download_imx500_models') + download_script = meson.project_source_root() / 'utils' / 'download-imx500-models.sh' + custom_target('imx500-models', + command : [ download_script, '@OUTPUT@' ], + output : 'imx500-models', + install : true, + install_dir : get_option('datadir'), + ) +endif diff --git a/post_processing_stages/meson.build b/post_processing_stages/meson.build index 1dd6edf7..aa101413 100644 --- a/post_processing_stages/meson.build +++ b/post_processing_stages/meson.build @@ -109,6 +109,11 @@ if hailort_dep.found() and hailo_tappas_dep.found() enable_hailo = true endif +# IMX500 postprocessing stages. +if get_option('enable_imx500') + subdir('imx500') +endif + post_processing_headers = files([ 'histogram.hpp', 'object_detect.hpp', diff --git a/post_processing_stages/post_processing_stage.hpp b/post_processing_stages/post_processing_stage.hpp index e745a491..30237ba5 100644 --- a/post_processing_stages/post_processing_stage.hpp +++ b/post_processing_stages/post_processing_stage.hpp @@ -5,9 +5,12 @@ * post_processing_stage.hpp - Post processing stage base class definition. */ +#pragma once + #include #include #include +#include // Prevents compiler warnings in Boost headers with more recent versions of GCC. #define BOOST_BIND_GLOBAL_PLACEHOLDERS @@ -71,6 +74,24 @@ class PostProcessingStage return std::chrono::duration(t2 - t1); } + template + static std::vector GetJsonArray(const boost::property_tree::ptree &pt, const std::string &key, + const std::vector &default_value = {}) + { + std::vector vec; + + if (pt.find(key) != pt.not_found()) + { + for (auto &v : pt.get_child(key)) + vec.push_back(v.second.get_value()); + } + + for (unsigned int i = vec.size(); i < default_value.size(); i++) + vec.push_back(default_value[i]); + + return vec; + } + RPiCamApp *app_; }; diff --git a/utils/download-imx500-models.sh b/utils/download-imx500-models.sh new file mode 100755 index 00000000..04e2ef3f --- /dev/null +++ b/utils/download-imx500-models.sh @@ -0,0 +1,42 @@ +#!/bin/bash +set -e + +networks=( + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_deeplabv3plus.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientdet_lite0_pp.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnet_bo.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnet_lite0.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnetv2_b0.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnetv2_b1.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnetv2_b2.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_higherhrnet_coco.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_inputtensoronly.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_levit_128s.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mnasnet1.0.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mobilenet_v2.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mobilevit_xs.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mobilevit_xxs.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_nanodet_plus_416x416_pp.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_nanodet_plus_416x416.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_posenet.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_regnetx_002.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_regnety_002.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_regnety_004.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_resnet18.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_shufflenet_v2_x1_5.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_squeezenet1.0.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_ssd_mobilenetv2_fpnlite_320x320_pp.rpk + https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_yolov8n_pp.rpk +) + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +dir=$1 +mkdir -p $dir + +for url in ${networks[@]}; do + wget -nv -N -P $dir $url +done