feat: output aabbs from retinaface

2025-08-04 11:43:16 +05:30
parent c751654799
commit f55f0ab089
4 changed files with 220 additions and 182 deletions
--- a/src/facedet.rs
+++ b/src/facedet.rs
@@ -1,180 +1,2 @@
-use crate::errors::*;
+pub mod retinaface;
-use bounding_box::Aabb2;
+pub mod yolo;
 use error_stack::ResultExt;
 use mnn_bridge::ndarray::*;
 use nalgebra::{Point2, Vector2};
 use ndarray_resize::NdFir;
 use std::path::Path;
 /// Parameters used to generate the anchor grid during post-processing.
 pub struct FaceDetectionConfig {
    /// Anchor sizes for each entry of `steps`; `postprocess` indexes this with the
    /// position of the step and emits one anchor per component.
    min_sizes: Vec<Vector2<usize>>,
    /// Grid strides; `postprocess` walks a `640 / step` square grid for each one.
    steps: Vec<usize>,
    /// NOTE(review): not read by the `postprocess` in this file; presumably the
    /// box-decoding variances — confirm once decoding is implemented.
    variance: Vec<f32>,
 }
 /// Face detector that runs its model through an `mnn_sync::SessionHandle`.
 pub struct FaceDetection {
    /// Session handle created in `new_from_bytes` and used by `detect_faces`.
    handle: mnn_sync::SessionHandle,
 }
 /// Raw tensors copied out of the model by `FaceDetection::detect_faces`.
 pub struct FaceDetectionModelOutput {
    /// Contents of the model output named `"bbox"`.
    pub bbox: ndarray::Array3<f32>,
    /// Contents of the model output named `"confidence"`; `print` reads index 1 of the
    /// last axis as the score of a candidate.
    pub confidence: ndarray::Array3<f32>,
    /// Contents of the model output named `"landmark"`.
    pub landmark: ndarray::Array3<f32>,
 }
 impl FaceDetectionModelOutput {
    /// Placeholder post-processing: generates the anchor grid described by `config`
    /// and returns an empty list of boxes.
    ///
    /// For every entry of `config.steps` a `640 / step` square grid is walked row by
    /// row. Each cell contributes one `[cx, cy, w, h]` anchor per component of the
    /// matching `config.min_sizes` entry, with the cell centre and the size both
    /// divided by 640. The anchors are not combined with the model output yet.
    ///
    /// # Panics
    ///
    /// Panics if a step is zero, or if a non-empty grid has no matching entry in
    /// `config.min_sizes`.
    pub fn postprocess(self, config: FaceDetectionConfig) -> Result<Vec<Aabb2<f32>>> {
        let mut anchors = Vec::new();
        for (level, &stride) in config.steps.iter().enumerate() {
            let cells = 640 / stride;
            for row in 0..cells {
                let cy = (row as f32 + 0.5) * stride as f32 / 640.0;
                for col in 0..cells {
                    let cx = (col as f32 + 0.5) * stride as f32 / 640.0;
                    for &side in &config.min_sizes[level] {
                        let extent = side as f32 / 640.0;
                        anchors.push([cx, cy, extent, extent]);
                    }
                }
            }
        }
        Ok(Vec::new())
    }
 }
 impl FaceDetectionModelOutput {
    /// Logs up to `limit` candidates whose score is above 0.1.
    ///
    /// Only index 0 of the leading axis is inspected. The score of a candidate is
    /// element 1 of its `confidence` row.
    ///
    /// # Panics
    ///
    /// Panics if the leading axis of `bbox` or `confidence` is empty, or if a
    /// `confidence` row has fewer than two elements.
    pub fn print(&self, limit: usize) {
        // The number logged here is the length of axis 1 of `bbox`, i.e. the count of
        // all candidates before any score filtering.
        tracing::info!("Detected {} faces", self.bbox.shape()[1]);
        // Borrow views of the first entry instead of cloning the whole tensors just
        // to drop the leading axis.
        let candidates = self.bbox.index_axis(ndarray::Axis(0), 0);
        let scores = self.confidence.index_axis(ndarray::Axis(0), 0);
        for (bbox, confidence) in candidates
            .axis_iter(ndarray::Axis(0))
            .zip(scores.axis_iter(ndarray::Axis(0)).map(|c| c[1]))
            .filter(|(_, c)| *c > 0.1)
            .take(limit)
        {
            tracing::info!("Face BBox: {:?}, Confidence: {:.2}", bbox, confidence);
        }
    }
 }
 impl FaceDetection {
    /// Builds a detector from the model file stored at `path`.
    ///
    /// The file is read fully into memory and handed to [`Self::new_from_bytes`].
    ///
    /// # Errors
    ///
    /// Fails when the file cannot be read or when the model cannot be loaded.
    pub fn new(path: impl AsRef<Path>) -> Result<Self> {
        let bytes = std::fs::read(path)
            .change_context(Error)
            .attach_printable("Failed to read model file")?;
        Self::new_from_bytes(bytes.as_slice())
    }
    pub fn new_from_bytes(model: &[u8]) -> Result<Self> {
        tracing::info!("Loading face detection model from bytes");
        let mut model = mnn::Interpreter::from_bytes(model)
            .map_err(|e| e.into_inner())
            .change_context(Error)
            .attach_printable("Failed to load model from bytes")?;
        model.set_session_mode(mnn::SessionMode::Release);
        let bc = mnn::BackendConfig::default().with_memory_mode(mnn::MemoryMode::High);
        let sc = mnn::ScheduleConfig::new()
            .with_type(mnn::ForwardType::CoreML)
            .with_backend_config(bc);
        tracing::info!("Creating session handle for face detection model");
        let handle = mnn_sync::SessionHandle::new(model, sc)
            .change_context(Error)
            .attach_printable("Failed to create session handle")?;
        Ok(FaceDetection { handle })
    }
    /// Runs the detection model on `image` and returns its raw `bbox`, `confidence`
    /// and `landmark` outputs.
    ///
    /// The image is resized with `fast_resize(640, 640, None)`, converted to `f32`,
    /// has the constants 104, 117 and 123 subtracted from the first three lanes of
    /// axis 2, is permuted to axis order `(2, 0, 1)` and gets a leading axis inserted
    /// before being copied into the model input named `"input"`.
    ///
    /// Assumes `image` is laid out as (height, width, channel) — TODO confirm.
    ///
    /// # Errors
    ///
    /// Any failure inside the session closure (resize, tensor conversion, model
    /// calls) is converted to `Error`.
    pub fn detect_faces(&self, image: ndarray::Array3<u8>) -> Result<FaceDetectionModelOutput> {
        // Glob import of `tap`; `tap_mut` below comes from it.
        #[rustfmt::skip]
        use ::tap::*;
        let output = self
            .handle
            .run(move |sr| {
                let mut resized = image
                    .fast_resize(640, 640, None)
                    .change_context(mnn::ErrorKind::TensorError)?
                    .mapv(|f| f as f32)
                    .tap_mut(|arr| {
                        // NOTE(review): 104/117/123 look like per-channel means; confirm
                        // that the channel order they assume matches the caller's image.
                        arr.axis_iter_mut(ndarray::Axis(2))
                            .zip([104, 117, 123])
                            .for_each(|(mut array, pixel)| {
                                let pixel = pixel as f32;
                                array.map_inplace(|v| *v -= pixel);
                            });
                    })
                    .permuted_axes((2, 0, 1))
                    .insert_axis(ndarray::Axis(0))
                    .as_standard_layout()
                    .into_owned();
                let tensor = resized
                    .as_mnn_tensor_mut()
                    .attach_printable("Failed to convert ndarray to mnn tensor")
                    .change_context(mnn::error::ErrorKind::TensorError)?;
                tracing::trace!("Image Tensor shape: {:?}", tensor.shape());
                let (intptr, session) = sr.both_mut();
                tracing::trace!("Copying input tensor to host");
                // Resize the model input to 1x3x640x640 before the session is resized.
                // NOTE(review): no SAFETY justification is recorded for this block;
                // confirm the contract of `input_unresized` / `resize_tensor_by_nchw`.
                unsafe {
                    let mut input = intptr.input_unresized::<f32>(session, "input")?;
                    tracing::trace!("Input shape: {:?}", input.shape());
                    intptr.resize_tensor_by_nchw::<mnn::View<&mut f32>, _>(
                        input.view_mut(),
                        1,
                        3,
                        640,
                        640,
                    );
                }
                intptr.resize_session(session);
                // Fetch the input again after the session resize and fill it.
                let mut input = intptr.input::<f32>(session, "input")?;
                tracing::trace!("Input shape: {:?}", input.shape());
                input.copy_from_host_tensor(tensor.view())?;
                tracing::info!("Running face detection session");
                intptr.run_session(&session)?;
                // Each output is turned into a host tensor and then into an owned
                // ndarray. Presumably the `true` argument copies the data — TODO confirm.
                let output_tensor = intptr
                    .output::<f32>(&session, "bbox")?
                    .create_host_tensor_from_device(true)
                    .as_ndarray()
                    .to_owned();
                tracing::trace!("Output Bbox: \t\t{:?}", output_tensor.shape());
                let output_confidence = intptr
                    .output::<f32>(&session, "confidence")?
                    .create_host_tensor_from_device(true)
                    .as_ndarray::<ndarray::Ix3>()
                    .to_owned();
                tracing::trace!("Output Confidence: \t{:?}", output_confidence.shape());
                let output_landmark = intptr
                    .output::<f32>(&session, "landmark")?
                    .create_host_tensor_from_device(true)
                    .as_ndarray::<ndarray::Ix3>()
                    .to_owned();
                tracing::trace!("Output Landmark: \t{:?}", output_landmark.shape());
                Ok(FaceDetectionModelOutput {
                    bbox: output_tensor,
                    confidence: output_confidence,
                    landmark: output_landmark,
                })
            })
            .map_err(|e| e.into_inner())
            .change_context(Error)?;
        Ok(output)
    }
 }
--- a/src/facedet/retinaface.rs
+++ b/src/facedet/retinaface.rs
@@ -0,0 +1,213 @@
 use crate::errors::*;
 use bounding_box::Aabb2;
 use error_stack::ResultExt;
 use mnn_bridge::ndarray::*;
 use nalgebra::{Point2, Vector2};
 use ndarray_resize::NdFir;
 use std::path::Path;
 /// Parameters used by `FaceDetectionModelOutput::postprocess` to generate anchors and
 /// decode boxes.
 pub struct FaceDetectionConfig {
    /// Anchor sizes for each entry of `steps`; the `x` and `y` components each produce
    /// one square anchor per grid cell.
    min_sizes: Vec<Vector2<usize>>,
    /// Grid strides; `postprocess` walks a `640 / step` square grid for each one.
    steps: Vec<usize>,
    /// Decoding scale factors: element 0 scales the centre offsets, element 1 scales
    /// the size offsets before `exp`.
    variance: Vec<f32>,
 }
 impl Default for FaceDetectionConfig {
    /// Returns the built-in configuration: three strides (8, 16, 32), each paired with
    /// two anchor sizes, and the variances `[0.1, 0.2]`.
    fn default() -> Self {
        // One (small, large) anchor size pair per stride, in stride order.
        let size_pairs = [(16, 32), (64, 128), (256, 512)];
        let min_sizes = size_pairs
            .iter()
            .map(|&(small, large)| Vector2::new(small, large))
            .collect();
        Self {
            min_sizes,
            steps: vec![8, 16, 32],
            variance: vec![0.1, 0.2],
        }
    }
 }
 /// Face detector that runs its model through an `mnn_sync::SessionHandle`.
 pub struct FaceDetection {
    /// Session handle created in `new_from_bytes` and used by `detect_faces`.
    handle: mnn_sync::SessionHandle,
 }
 /// Raw tensors copied out of the model by `FaceDetection::detect_faces`.
 pub struct FaceDetectionModelOutput {
    /// Contents of the model output named `"bbox"`; `postprocess` reads elements 0..4 of
    /// each row as centre-x, centre-y, width and height offsets.
    pub bbox: ndarray::Array3<f32>,
    /// Contents of the model output named `"confidence"`; element 1 of each row is read
    /// as the score of a candidate.
    pub confidence: ndarray::Array3<f32>,
    /// Contents of the model output named `"landmark"`.
    pub landmark: ndarray::Array3<f32>,
 }
 impl FaceDetectionModelOutput {
    pub fn postprocess(self, config: FaceDetectionConfig) -> Result<Vec<Aabb2<f32>>> {
        let mut anchors = Vec::new();
        for (k, &step) in config.steps.iter().enumerate() {
            let feature_size = 640 / step;
            let min_sizes = config.min_sizes[k];
            let sizes = [min_sizes.x, min_sizes.y];
            for i in 0..feature_size {
                for j in 0..feature_size {
                    for &size in &sizes {
                        let cx = (j as f32 + 0.5) * step as f32 / 640.0;
                        let cy = (i as f32 + 0.5) * step as f32 / 640.0;
                        let s_k = size as f32 / 640.0;
                        anchors.push((cx, cy, s_k, s_k));
                    }
                }
            }
        }
        let mut boxes = Vec::new();
        let var0 = config.variance[0];
        let var1 = config.variance[1];
        let bbox_data = self.bbox;
        let conf_data = self.confidence;
        let num_priors = bbox_data.shape()[1];
        for idx in 0..num_priors {
            let dx = bbox_data[[0, idx, 0]];
            let dy = bbox_data[[0, idx, 1]];
            let dw = bbox_data[[0, idx, 2]];
            let dh = bbox_data[[0, idx, 3]];
            let (anchor_cx, anchor_cy, anchor_w, anchor_h) = anchors[idx];
            let pred_cx = anchor_cx + dx * var0 * anchor_w;
            let pred_cy = anchor_cy + dy * var0 * anchor_h;
            let pred_w = anchor_w * (dw * var1).exp();
            let pred_h = anchor_h * (dh * var1).exp();
            let x_min = pred_cx - pred_w / 2.0;
            let y_min = pred_cy - pred_h / 2.0;
            let x_max = pred_cx + pred_w / 2.0;
            let y_max = pred_cy + pred_h / 2.0;
            let score = conf_data[[0, idx, 1]];
            if score > 0.6 {
                boxes.push(Aabb2::from_min_max_vertices(
                    Point2::new(x_min, y_min),
                    Point2::new(x_max, y_max),
                ));
            }
        }
        Ok(boxes)
    }
 }
 impl FaceDetectionModelOutput {
    /// Logs up to `limit` candidates whose score is above 0.1.
    ///
    /// Only index 0 of the leading axis is inspected. The score of a candidate is
    /// element 1 of its `confidence` row.
    ///
    /// # Panics
    ///
    /// Panics if the leading axis of `bbox` or `confidence` is empty, or if a
    /// `confidence` row has fewer than two elements.
    pub fn print(&self, limit: usize) {
        // The number logged here is the length of axis 1 of `bbox`, i.e. the count of
        // all candidates before any score filtering.
        tracing::info!("Detected {} faces", self.bbox.shape()[1]);
        // Borrow views of the first entry instead of cloning the whole tensors just
        // to drop the leading axis.
        let candidates = self.bbox.index_axis(ndarray::Axis(0), 0);
        let scores = self.confidence.index_axis(ndarray::Axis(0), 0);
        for (bbox, confidence) in candidates
            .axis_iter(ndarray::Axis(0))
            .zip(scores.axis_iter(ndarray::Axis(0)).map(|c| c[1]))
            .filter(|(_, c)| *c > 0.1)
            .take(limit)
        {
            tracing::info!("Face BBox: {:?}, Confidence: {:.2}", bbox, confidence);
        }
    }
 }
 impl FaceDetection {
    /// Builds a detector from the model file stored at `path`.
    ///
    /// The file is read fully into memory and handed to [`Self::new_from_bytes`].
    ///
    /// # Errors
    ///
    /// Fails when the file cannot be read or when the model cannot be loaded.
    pub fn new(path: impl AsRef<Path>) -> Result<Self> {
        let bytes = std::fs::read(path)
            .change_context(Error)
            .attach_printable("Failed to read model file")?;
        Self::new_from_bytes(bytes.as_slice())
    }
    pub fn new_from_bytes(model: &[u8]) -> Result<Self> {
        tracing::info!("Loading face detection model from bytes");
        let mut model = mnn::Interpreter::from_bytes(model)
            .map_err(|e| e.into_inner())
            .change_context(Error)
            .attach_printable("Failed to load model from bytes")?;
        model.set_session_mode(mnn::SessionMode::Release);
        let bc = mnn::BackendConfig::default().with_memory_mode(mnn::MemoryMode::High);
        let sc = mnn::ScheduleConfig::new()
            .with_type(mnn::ForwardType::CPU)
            .with_backend_config(bc);
        tracing::info!("Creating session handle for face detection model");
        let handle = mnn_sync::SessionHandle::new(model, sc)
            .change_context(Error)
            .attach_printable("Failed to create session handle")?;
        Ok(FaceDetection { handle })
    }
    /// Runs the detection model on `image` and returns its raw `bbox`, `confidence`
    /// and `landmark` outputs.
    ///
    /// The image is resized with `fast_resize(640, 640, None)`, converted to `f32`,
    /// has the constants 104, 117 and 123 subtracted from the first three lanes of
    /// axis 2, is permuted to axis order `(2, 0, 1)` and gets a leading axis inserted
    /// before being copied into the model input named `"input"`.
    ///
    /// Assumes `image` is laid out as (height, width, channel) — TODO confirm.
    ///
    /// # Errors
    ///
    /// Any failure inside the session closure (resize, tensor conversion, model
    /// calls) is converted to `Error`.
    pub fn detect_faces(&self, image: ndarray::Array3<u8>) -> Result<FaceDetectionModelOutput> {
        // Glob import of `tap`; `tap_mut` below comes from it.
        #[rustfmt::skip]
        use ::tap::*;
        let output = self
            .handle
            .run(move |sr| {
                let mut resized = image
                    .fast_resize(640, 640, None)
                    .change_context(mnn::ErrorKind::TensorError)?
                    .mapv(|f| f as f32)
                    .tap_mut(|arr| {
                        // NOTE(review): 104/117/123 look like per-channel means; confirm
                        // that the channel order they assume matches the caller's image.
                        arr.axis_iter_mut(ndarray::Axis(2))
                            .zip([104, 117, 123])
                            .for_each(|(mut array, pixel)| {
                                let pixel = pixel as f32;
                                array.map_inplace(|v| *v -= pixel);
                            });
                    })
                    .permuted_axes((2, 0, 1))
                    .insert_axis(ndarray::Axis(0))
                    .as_standard_layout()
                    .into_owned();
                let tensor = resized
                    .as_mnn_tensor_mut()
                    .attach_printable("Failed to convert ndarray to mnn tensor")
                    .change_context(mnn::error::ErrorKind::TensorError)?;
                tracing::trace!("Image Tensor shape: {:?}", tensor.shape());
                let (intptr, session) = sr.both_mut();
                tracing::trace!("Copying input tensor to host");
                // Resize the model input to 1x3x640x640 before the session is resized.
                // NOTE(review): no SAFETY justification is recorded for this block;
                // confirm the contract of `input_unresized` / `resize_tensor_by_nchw`.
                unsafe {
                    let mut input = intptr.input_unresized::<f32>(session, "input")?;
                    tracing::trace!("Input shape: {:?}", input.shape());
                    intptr.resize_tensor_by_nchw::<mnn::View<&mut f32>, _>(
                        input.view_mut(),
                        1,
                        3,
                        640,
                        640,
                    );
                }
                intptr.resize_session(session);
                // Fetch the input again after the session resize and fill it.
                let mut input = intptr.input::<f32>(session, "input")?;
                tracing::trace!("Input shape: {:?}", input.shape());
                input.copy_from_host_tensor(tensor.view())?;
                tracing::info!("Running face detection session");
                intptr.run_session(&session)?;
                // Each output is turned into a host tensor and then into an owned
                // ndarray. Presumably the `true` argument copies the data — TODO confirm.
                let output_tensor = intptr
                    .output::<f32>(&session, "bbox")?
                    .create_host_tensor_from_device(true)
                    .as_ndarray()
                    .to_owned();
                tracing::trace!("Output Bbox: \t\t{:?}", output_tensor.shape());
                let output_confidence = intptr
                    .output::<f32>(&session, "confidence")?
                    .create_host_tensor_from_device(true)
                    .as_ndarray::<ndarray::Ix3>()
                    .to_owned();
                tracing::trace!("Output Confidence: \t{:?}", output_confidence.shape());
                let output_landmark = intptr
                    .output::<f32>(&session, "landmark")?
                    .create_host_tensor_from_device(true)
                    .as_ndarray::<ndarray::Ix3>()
                    .to_owned();
                tracing::trace!("Output Landmark: \t{:?}", output_landmark.shape());
                Ok(FaceDetectionModelOutput {
                    bbox: output_tensor,
                    confidence: output_confidence,
                    landmark: output_landmark,
                })
            })
            .map_err(|e| e.into_inner())
            .change_context(Error)?;
        Ok(output)
    }
 }
--- a/src/facedet/yolo.rs
+++ b/src/facedet/yolo.rs
@@ -0,0 +1 @@
--- a/src/main.rs
+++ b/src/main.rs
@@ -14,7 +14,7 @@ pub fn main() -> Result<()> {
    match args.cmd {
        cli::SubCommand::Detect(detect) => {
            use detector::facedet;
-            let model = facedet::FaceDetection::new_from_bytes(RETINAFACE_MODEL)
+            let model = facedet::retinaface::FaceDetection::new_from_bytes(RETINAFACE_MODEL)
                .change_context(errors::Error)
                .attach_printable("Failed to create face detection model")?;
            let image = image::open(detect.image).change_context(Error)?;
@@ -27,7 +27,9 @@ pub fn main() -> Result<()> {
                .detect_faces(array)
                .change_context(errors::Error)
                .attach_printable("Failed to detect faces")?;
-            output.print(20);
+            // output.print(20);
            let aabbs = output.postprocess(Default::default());
            dbg!(aabbs);
        }
        cli::SubCommand::List(list) => {
            println!("List: {:?}", list);