face-detector/src/facedet/retinaface.rs

pub mod mnn;
pub mod ort;

use crate::errors::*;
use bounding_box::{Aabb2, nms::nms};
use error_stack::ResultExt;
use nalgebra::{Point2, Vector2};

/// Configuration for face detection postprocessing.
///
/// The same configuration drives anchor generation (`steps`, `min_sizes`,
/// `input_width`, `input_height`), box/landmark decoding (`variances`,
/// `clamp`, `threshold`) and non-maximum suppression (`threshold`,
/// `nms_threshold`).
#[derive(Debug, Clone, PartialEq)]
pub struct FaceDetectionConfig {
    /// Minimum confidence to keep a detection. A candidate is kept only when
    /// its score is strictly greater than this value; the value is also
    /// forwarded to the NMS routine as its score threshold.
    pub threshold: f32,
    /// NMS threshold for suppressing overlapping boxes
    pub nms_threshold: f32,
    /// Variances for bounding box decoding: element 0 scales the centre
    /// offsets (and landmark offsets), element 1 scales the exponent used for
    /// the width/height.
    pub variances: [f32; 2],
    /// The step size (stride) for each feature map
    pub steps: Vec<usize>,
    /// The minimum anchor sizes for each feature map. Indexed by the same
    /// position as `steps`, so it needs at least as many entries as `steps`.
    pub min_sizes: Vec<Vec<usize>>,
    /// Whether to clamp decoded bounding boxes to the normalized `[0, 1]` range
    pub clamp: bool,
    /// Input image width (used for anchor generation)
    pub input_width: usize,
    /// Input image height (used for anchor generation)
    pub input_height: usize,
}

/// Chainable setters: each consumes the configuration and returns it with a
/// single field replaced, leaving every other field untouched.
impl FaceDetectionConfig {
    /// Replaces the minimum confidence required to keep a detection.
    pub fn with_threshold(self, threshold: f32) -> Self {
        Self { threshold, ..self }
    }

    /// Replaces the threshold used by non-maximum suppression.
    pub fn with_nms_threshold(self, nms_threshold: f32) -> Self {
        Self {
            nms_threshold,
            ..self
        }
    }

    /// Replaces the pair of variances used when decoding boxes.
    pub fn with_variances(self, variances: [f32; 2]) -> Self {
        Self { variances, ..self }
    }

    /// Replaces the per-feature-map strides.
    pub fn with_steps(self, steps: Vec<usize>) -> Self {
        Self { steps, ..self }
    }

    /// Replaces the per-feature-map minimum anchor sizes.
    pub fn with_min_sizes(self, min_sizes: Vec<Vec<usize>>) -> Self {
        Self { min_sizes, ..self }
    }

    /// Sets the `clamp` flag (the parameter is called `clip`, the field `clamp`).
    pub fn with_clip(self, clip: bool) -> Self {
        Self {
            clamp: clip,
            ..self
        }
    }

    /// Replaces the input width used for anchor generation.
    pub fn with_input_width(self, input_width: usize) -> Self {
        Self {
            input_width,
            ..self
        }
    }

    /// Replaces the input height used for anchor generation.
    pub fn with_input_height(self, input_height: usize) -> Self {
        Self {
            input_height,
            ..self
        }
    }
}

impl Default for FaceDetectionConfig {
    /// Builds the default configuration: a 1024x1024 input, three feature
    /// levels with strides 8/16/32 and two anchor sizes per level, clamping
    /// enabled, score threshold 0.5 and NMS threshold 0.4.
    fn default() -> Self {
        // One stride per feature level ...
        let steps = Vec::from([8, 16, 32]);
        // ... and the anchor sizes that belong to each of those levels.
        let min_sizes = Vec::from([
            Vec::from([16, 32]),
            Vec::from([64, 128]),
            Vec::from([256, 512]),
        ]);

        Self {
            input_width: 1024,
            input_height: 1024,
            steps,
            min_sizes,
            variances: [0.1, 0.2],
            clamp: true,
            threshold: 0.5,
            nms_threshold: 0.4,
        }
    }
}

/// Represents the 5 facial landmarks detected by RetinaFace.
///
/// `FaceDetectionModelOutput::postprocess` fills these from the ten values of
/// a landmark row, taken as consecutive `(x, y)` pairs in the field order
/// below.
#[derive(Debug, Copy, Clone, PartialEq)]
pub struct FaceLandmarks {
    /// First `(x, y)` pair of the landmark row.
    pub left_eye: Point2<f32>,
    /// Second `(x, y)` pair of the landmark row.
    pub right_eye: Point2<f32>,
    /// Third `(x, y)` pair of the landmark row.
    pub nose: Point2<f32>,
    /// Fourth `(x, y)` pair of the landmark row.
    pub left_mouth: Point2<f32>,
    /// Fifth `(x, y)` pair of the landmark row.
    pub right_mouth: Point2<f32>,
}

/// Raw, undecoded tensors produced by the detection network.
///
/// `postprocess` only reads index 0 of the first axis of each array and
/// treats the second axis as one row per anchor.
#[derive(Debug, Clone, PartialEq)]
pub struct FaceDetectionModelOutput {
    /// Box regression values; `postprocess` reads columns 0..4 of each row.
    pub bbox: ndarray::Array3<f32>,
    /// Class scores; `postprocess` uses column 1 of each row as the score.
    pub confidence: ndarray::Array3<f32>,
    /// Landmark regression values; `postprocess` reads columns 0..10 of each row.
    pub landmark: ndarray::Array3<f32>,
}

/// Decoded detections that passed the confidence threshold, before NMS.
///
/// The three vectors are parallel: index `i` of each describes the same
/// candidate.
#[derive(Debug, Clone, PartialEq)]
pub struct FaceDetectionProcessedOutput {
    /// Decoded boxes in normalized coordinates (derived from normalized anchors).
    pub bbox: Vec<Aabb2<f32>>,
    /// Score of each candidate.
    pub confidence: Vec<f32>,
    /// Decoded landmarks of each candidate.
    pub landmarks: Vec<FaceLandmarks>,
}

/// Final detections returned by `apply_nms_and_finalize`.
#[derive(Debug, Clone, PartialEq)]
pub struct FaceDetectionOutput {
    /// Boxes scaled by the image size and cast to integer coordinates.
    pub bbox: Vec<Aabb2<usize>>,
    /// Score of each kept detection.
    pub confidence: Vec<f32>,
    /// Landmarks of each kept detection (note: the field name is singular).
    pub landmark: Vec<FaceLandmarks>,
}

/// Raw model outputs that can be converted to FaceDetectionModelOutput
pub trait IntoModelOutput {
    /// Consumes `self` and produces a `FaceDetectionModelOutput`, or an error
    /// when the conversion fails.
    fn into_model_output(self) -> Result<FaceDetectionModelOutput>;
}

/// Generate anchors for RetinaFace model.
///
/// For every stride in `config.steps` a grid of
/// `ceil(input_height / step)` x `ceil(input_width / step)` cells is walked
/// row by row, and each cell emits one anchor per entry of the matching
/// `config.min_sizes` list. The result has one row per anchor laid out as
/// `[xmin, ymin, xmax, ymax]`, normalized by the configured input size.
///
/// # Panics
///
/// Panics if `config.min_sizes` has fewer entries than `config.steps`.
pub fn generate_anchors(config: &FaceDetectionConfig) -> ndarray::Array2<f32> {
    let width = config.input_width as f32;
    let height = config.input_height as f32;

    // Anchors are written straight into one flat buffer, four values each.
    let mut flat: Vec<f32> = Vec::new();

    for (level, &step) in config.steps.iter().enumerate() {
        let stride = step as f32;
        let rows = (height / stride).ceil() as usize;
        let cols = (width / stride).ceil() as usize;
        let sizes = &config.min_sizes[level];

        for row in 0..rows {
            // The centre of a cell sits half a stride into it.
            let center_y = (row as f32 + 0.5) * stride / height;
            for col in 0..cols {
                let center_x = (col as f32 + 0.5) * stride / width;
                for &size in sizes {
                    let half_w = size as f32 / width / 2.;
                    let half_h = size as f32 / height / 2.;
                    flat.extend_from_slice(&[
                        center_x - half_w,
                        center_y - half_h,
                        center_x + half_w,
                        center_y + half_h,
                    ]);
                }
            }
        }
    }

    // The buffer length is a multiple of four by construction, so the shape
    // always matches and this cannot fail.
    let anchor_count = flat.len() / 4;
    ndarray::Array2::from_shape_vec((anchor_count, 4), flat).unwrap()
}

impl FaceDetectionModelOutput {
    pub fn postprocess(self, config: &FaceDetectionConfig) -> Result<FaceDetectionProcessedOutput> {
        use ndarray::s;

        let priors = generate_anchors(config);

        let scores = self.confidence.slice(s![0, .., 1]);
        let boxes = self.bbox.slice(s![0, .., ..]);
        let landmarks_raw = self.landmark.slice(s![0, .., ..]);

        // let mut decoded_boxes = Vec::new();
        // let mut decoded_landmarks = Vec::new();
        // let mut confidences = Vec::new();

        let (decoded_boxes, decoded_landmarks, confidences) = (0..priors.shape()[0])
            .filter(|&i| scores[i] > config.threshold)
            .map(|i| {
                let prior = priors.row(i);
                let loc = boxes.row(i);
                let landm = landmarks_raw.row(i);

                // Decode bounding box
                let prior_cx = (prior[0] + prior[2]) / 2.0;
                let prior_cy = (prior[1] + prior[3]) / 2.0;
                let prior_w = prior[2] - prior[0];
                let prior_h = prior[3] - prior[1];

                let var = config.variances;
                let cx = prior_cx + loc[0] * var[0] * prior_w;
                let cy = prior_cy + loc[1] * var[0] * prior_h;
                let w = prior_w * (loc[2] * var[1]).exp();
                let h = prior_h * (loc[3] * var[1]).exp();

                let xmin = cx - w / 2.0;
                let ymin = cy - h / 2.0;
                let xmax = cx + w / 2.0;
                let ymax = cy + h / 2.0;

                let mut bbox =
                    Aabb2::from_min_max_vertices(Point2::new(xmin, ymin), Point2::new(xmax, ymax));
                if config.clamp {
                    bbox = bbox.component_clamp(0.0, 1.0);
                }

                // Decode landmarks
                let points: [Point2<f32>; 5] = (0..5)
                    .map(|j| {
                        Point2::new(
                            prior_cx + landm[j * 2] * var[0] * prior_w,
                            prior_cy + landm[j * 2 + 1] * var[0] * prior_h,
                        )
                    })
                    .collect::<Vec<_>>()
                    .try_into()
                    .unwrap();

                let landmarks = FaceLandmarks {
                    left_eye: points[0],
                    right_eye: points[1],
                    nose: points[2],
                    left_mouth: points[3],
                    right_mouth: points[4],
                };

                (bbox, landmarks, scores[i])
            })
            .fold(
                (Vec::new(), Vec::new(), Vec::new()),
                |(mut boxes, mut landmarks, mut confs), (bbox, landmark, conf)| {
                    boxes.push(bbox);
                    landmarks.push(landmark);
                    confs.push(conf);
                    (boxes, landmarks, confs)
                },
            );
        Ok(FaceDetectionProcessedOutput {
            bbox: decoded_boxes,
            confidence: confidences,
            landmarks: decoded_landmarks,
        })
    }

    pub fn print(&self, limit: usize) {
        tracing::info!("Detected {} faces", self.bbox.shape()[1]);

        for (bbox, confidence) in self
            .bbox
            .clone()
            .remove_axis(ndarray::Axis(0))
            .axis_iter(ndarray::Axis(0))
            .zip(
                self.confidence
                    .clone()
                    .remove_axis(ndarray::Axis(0))
                    .axis_iter(ndarray::Axis(0))
                    .map(|c| c[1]),
            )
            .filter(|(_, c)| *c > 0.1)
            .take(limit)
        {
            tracing::info!("Face BBox: {:?}, Confidence: {:.2}", bbox, confidence);
        }
    }
}

/// Apply Non-Maximum Suppression and convert to final output format.
///
/// Candidates are sorted by ascending score, passed to `nms` with
/// `config.threshold` and `config.nms_threshold`, and the survivors are
/// scaled by `image_size` (`(width, height)`) and cast to integer boxes.
/// Landmarks are passed through unchanged; they are not scaled by
/// `image_size`.
///
/// A surviving box whose integer cast fails is dropped together with its
/// confidence and landmarks, so the three output vectors always stay
/// index-aligned. Previously only the box was dropped, which shifted every
/// later box against its confidence and landmarks.
///
/// # Errors
///
/// Returns an error when the NMS routine fails.
pub fn apply_nms_and_finalize(
    processed: FaceDetectionProcessedOutput,
    config: &FaceDetectionConfig,
    image_size: (usize, usize), // (width, height)
) -> Result<FaceDetectionOutput> {
    use itertools::Itertools;

    let factor = Vector2::new(image_size.0 as f32, image_size.1 as f32);

    // `processed` is owned, so its vectors are consumed rather than cloned.
    let (boxes, scores, landmarks): (Vec<_>, Vec<_>, Vec<_>) = processed
        .bbox
        .into_iter()
        .zip(processed.confidence)
        .zip(processed.landmarks)
        .sorted_by_key(|((_, score), _)| ordered_float::OrderedFloat(*score))
        .map(|((b, s), l)| (b, s, l))
        .multiunzip();

    let keep_indices =
        nms(&boxes, &scores, config.threshold, config.nms_threshold).change_context(Error)?;

    // Filter the three parallel vectors in one pass so that dropping a
    // detection always removes its box, score and landmarks together.
    let (bbox, confidence, landmark): (Vec<_>, Vec<_>, Vec<_>) = boxes
        .into_iter()
        .zip(scores)
        .zip(landmarks)
        .enumerate()
        .filter(|(index, _)| keep_indices.contains(index))
        .flat_map(|(_, ((candidate, score), points))| {
            candidate
                .denormalize(factor)
                .try_cast::<usize>()
                .map(|pixel_box| (pixel_box, score, points))
        })
        .multiunzip();

    Ok(FaceDetectionOutput {
        bbox,
        confidence,
        landmark,
    })
}

/// Common trait for face detection backends.
///
/// A backend only implements `run_model`; the provided `detect_faces`
/// method chains inference, decoding and NMS.
pub trait FaceDetector {
    /// Run inference on the model and return raw outputs
    fn run_model(&mut self, image: ndarray::ArrayView3<u8>) -> Result<FaceDetectionModelOutput>;

    /// Detect faces with full pipeline including postprocessing.
    ///
    /// The first two axes of `image` are read as `(height, width)` and are
    /// handed to `apply_nms_and_finalize` as `(width, height)`.
    fn detect_faces(
        &mut self,
        image: ndarray::ArrayView3<u8>,
        config: &FaceDetectionConfig,
    ) -> Result<FaceDetectionOutput> {
        // Read the dimensions before the view is moved into `run_model`.
        let (rows, cols, _) = image.dim();
        let image_size = (cols, rows);

        let raw = self
            .run_model(image)
            .change_context(Error)
            .attach_printable("Failed to detect faces")?;
        let decoded = raw
            .postprocess(config)
            .attach_printable("Failed to postprocess")?;

        apply_nms_and_finalize(decoded, config, image_size)
    }
}