pub mod mnn;
pub mod ort;

use crate::errors::*;
use bounding_box::{Aabb2, nms::nms};
use error_stack::ResultExt;
use nalgebra::{Point2, Vector2};

/// Configuration for face detection postprocessing
#[derive(Debug, Clone, PartialEq)]
pub struct FaceDetectionConfig {
    /// Minimum confidence to keep a detection
    pub threshold: f32,
    /// NMS threshold for suppressing overlapping boxes
    pub nms_threshold: f32,
    /// Variances for bounding box decoding
    pub variances: [f32; 2],
    /// The step size (stride) for each feature map
    pub steps: Vec<usize>,
    /// The minimum anchor sizes for each feature map
    pub min_sizes: Vec<Vec<usize>>,
    /// Whether to clamp bounding boxes to the normalized [0, 1] range
    pub clamp: bool,
    /// Input image width (used for anchor generation)
    pub input_width: usize,
    /// Input image height (used for anchor generation)
    pub input_height: usize,
}

impl FaceDetectionConfig {
    pub fn with_threshold(mut self, threshold: f32) -> Self {
        self.threshold = threshold;
        self
    }

    pub fn with_nms_threshold(mut self, nms_threshold: f32) -> Self {
        self.nms_threshold = nms_threshold;
        self
    }

    pub fn with_variances(mut self, variances: [f32; 2]) -> Self {
        self.variances = variances;
        self
    }

    pub fn with_steps(mut self, steps: Vec<usize>) -> Self {
        self.steps = steps;
        self
    }

    pub fn with_min_sizes(mut self, min_sizes: Vec<Vec<usize>>) -> Self {
        self.min_sizes = min_sizes;
        self
    }

    pub fn with_clip(mut self, clip: bool) -> Self {
        self.clamp = clip;
        self
    }

    pub fn with_input_width(mut self, input_width: usize) -> Self {
        self.input_width = input_width;
        self
    }

    pub fn with_input_height(mut self, input_height: usize) -> Self {
        self.input_height = input_height;
        self
    }
}

impl Default for FaceDetectionConfig {
    fn default() -> Self {
        Self {
            threshold: 0.5,
            nms_threshold: 0.4,
            variances: [0.1, 0.2],
            steps: vec![8, 16, 32],
            min_sizes: vec![vec![16, 32], vec![64, 128], vec![256, 512]],
            clamp: true,
            input_width: 1024,
            input_height: 1024,
        }
    }
}

/// The 5 facial landmarks detected by RetinaFace
#[derive(Debug, Copy, Clone, PartialEq)]
pub struct FaceLandmarks {
    pub left_eye: Point2<f32>,
    pub right_eye: Point2<f32>,
    pub nose: Point2<f32>,
    pub left_mouth: Point2<f32>,
    pub right_mouth: Point2<f32>,
}

/// Raw output tensors from the detection model, batch-major
#[derive(Debug, Clone, PartialEq)]
pub struct FaceDetectionModelOutput {
    pub bbox: ndarray::Array3<f32>,
    pub confidence: ndarray::Array3<f32>,
    pub landmark: ndarray::Array3<f32>,
}

/// Decoded detections in normalized [0, 1] coordinates, before NMS
#[derive(Debug, Clone, PartialEq)]
pub struct FaceDetectionProcessedOutput {
    pub bbox: Vec<Aabb2<f32>>,
    pub confidence: Vec<f32>,
    pub landmarks: Vec<FaceLandmarks>,
}

/// Final detections in pixel coordinates, after NMS
#[derive(Debug, Clone, PartialEq)]
pub struct FaceDetectionOutput {
    pub bbox: Vec<Aabb2<u32>>,
    pub confidence: Vec<f32>,
    pub landmark: Vec<FaceLandmarks>,
}

/// Raw model outputs that can be converted to FaceDetectionModelOutput
pub trait IntoModelOutput {
    fn into_model_output(self) -> Result<FaceDetectionModelOutput>;
}

/// Generate SSD-style prior (anchor) boxes for the RetinaFace model
pub fn generate_anchors(config: &FaceDetectionConfig) -> ndarray::Array2<f32> {
    let mut anchors = Vec::new();
    // One feature map per stride: (height, width) in grid cells.
    let feature_maps: Vec<(usize, usize)> = config
        .steps
        .iter()
        .map(|&step| {
            (
                (config.input_height as f32 / step as f32).ceil() as usize,
                (config.input_width as f32 / step as f32).ceil() as usize,
            )
        })
        .collect();
    for (k, f) in feature_maps.iter().enumerate() {
        let min_sizes = &config.min_sizes[k];
        for i in 0..f.0 {
            for j in 0..f.1 {
                for &min_size in min_sizes {
                    // Anchor size and center, normalized to [0, 1].
                    let s_kx = min_size as f32 / config.input_width as f32;
                    let s_ky = min_size as f32 / config.input_height as f32;
                    let dense_cx =
                        (j as f32 + 0.5) * config.steps[k] as f32 / config.input_width as f32;
                    let dense_cy =
                        (i as f32 + 0.5) * config.steps[k] as f32 / config.input_height as f32;
                    anchors.push([
                        dense_cx - s_kx / 2.,
                        dense_cy - s_ky / 2.,
                        dense_cx + s_kx / 2.,
                        dense_cy + s_ky / 2.,
                    ]);
                }
            }
        }
    }
    ndarray::Array2::from_shape_vec((anchors.len(), 4), anchors.into_iter().flatten().collect())
        .unwrap()
}
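// A sanity-check sketch for `generate_anchors` (this test and its expected
// count are illustrative, not from the original source): with the default
// 1024x1024 input and strides [8, 16, 32], the feature maps are 128x128,
// 64x64, and 32x32, and each cell carries two anchor sizes, so the prior
// count is 2 * (128^2 + 64^2 + 32^2) = 43008.
#[cfg(test)]
mod anchor_tests {
    use super::*;

    #[test]
    fn default_config_anchor_count() {
        let anchors = generate_anchors(&FaceDetectionConfig::default());
        assert_eq!(anchors.shape(), &[43008, 4]);
    }
}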
impl FaceDetectionModelOutput {
    pub fn postprocess(
        self,
        config: &FaceDetectionConfig,
    ) -> Result<FaceDetectionProcessedOutput> {
        use ndarray::s;
        let priors = generate_anchors(config);
        let scores = self.confidence.slice(s![0, .., 1]);
        let boxes = self.bbox.slice(s![0, .., ..]);
        let landmarks_raw = self.landmark.slice(s![0, .., ..]);

        let (decoded_boxes, decoded_landmarks, confidences) = (0..priors.shape()[0])
            .filter(|&i| scores[i] > config.threshold)
            .map(|i| {
                let prior = priors.row(i);
                let loc = boxes.row(i);
                let landm = landmarks_raw.row(i);

                // Decode the bounding box from its SSD-style regression offsets.
                let prior_cx = (prior[0] + prior[2]) / 2.0;
                let prior_cy = (prior[1] + prior[3]) / 2.0;
                let prior_w = prior[2] - prior[0];
                let prior_h = prior[3] - prior[1];

                let var = config.variances;
                let cx = prior_cx + loc[0] * var[0] * prior_w;
                let cy = prior_cy + loc[1] * var[0] * prior_h;
                let w = prior_w * (loc[2] * var[1]).exp();
                let h = prior_h * (loc[3] * var[1]).exp();

                let xmin = cx - w / 2.0;
                let ymin = cy - h / 2.0;
                let xmax = cx + w / 2.0;
                let ymax = cy + h / 2.0;

                let mut bbox =
                    Aabb2::from_min_max_vertices(Point2::new(xmin, ymin), Point2::new(xmax, ymax));
                if config.clamp {
                    bbox = bbox.component_clamp(0.0, 1.0);
                }

                // Decode the five landmark points relative to the prior center.
                let points: [Point2<f32>; 5] = (0..5)
                    .map(|j| {
                        Point2::new(
                            prior_cx + landm[j * 2] * var[0] * prior_w,
                            prior_cy + landm[j * 2 + 1] * var[0] * prior_h,
                        )
                    })
                    .collect::<Vec<_>>()
                    .try_into()
                    .unwrap();
                let landmarks = FaceLandmarks {
                    left_eye: points[0],
                    right_eye: points[1],
                    nose: points[2],
                    left_mouth: points[3],
                    right_mouth: points[4],
                };

                (bbox, landmarks, scores[i])
            })
            .fold(
                (Vec::new(), Vec::new(), Vec::new()),
                |(mut boxes, mut landmarks, mut confs), (bbox, landmark, conf)| {
                    boxes.push(bbox);
                    landmarks.push(landmark);
                    confs.push(conf);
                    (boxes, landmarks, confs)
                },
            );

        Ok(FaceDetectionProcessedOutput {
            bbox: decoded_boxes,
            confidence: confidences,
            landmarks: decoded_landmarks,
        })
    }

    /// Log up to `limit` candidate boxes above a fixed 0.1 confidence, for debugging
    pub fn print(&self, limit: usize) {
        tracing::info!("Model output contains {} candidate boxes", self.bbox.shape()[1]);
        for (bbox, confidence) in self
            .bbox
            .clone()
            .remove_axis(ndarray::Axis(0))
            .axis_iter(ndarray::Axis(0))
            .zip(
                self.confidence
                    .clone()
                    .remove_axis(ndarray::Axis(0))
                    .axis_iter(ndarray::Axis(0))
                    .map(|c| c[1]),
            )
            .filter(|(_, c)| *c > 0.1)
            .take(limit)
        {
            tracing::info!("Face BBox: {:?}, Confidence: {:.2}", bbox, confidence);
        }
    }
}

/// Apply Non-Maximum Suppression and convert to the final output format
pub fn apply_nms_and_finalize(
    processed: FaceDetectionProcessedOutput,
    config: &FaceDetectionConfig,
    image_size: (usize, usize), // (width, height)
) -> Result<FaceDetectionOutput> {
    use itertools::Itertools;
    let factor = Vector2::new(image_size.0 as f32, image_size.1 as f32);
    // Sort detections by confidence before NMS.
    let (boxes, scores, landmarks): (Vec<_>, Vec<_>, Vec<_>) = processed
        .bbox
        .iter()
        .cloned()
        .zip(processed.confidence.iter().cloned())
        .zip(processed.landmarks.iter().cloned())
        .sorted_by_key(|((_, score), _)| ordered_float::OrderedFloat(*score))
        .map(|((b, s), l)| (b, s, l))
        .multiunzip();
    let keep_indices =
        nms(&boxes, &scores, config.threshold, config.nms_threshold).change_context(Error)?;
    // Keep only the surviving detections, scaled back to pixel coordinates.
    let bboxes = boxes
        .into_iter()
        .enumerate()
        .filter(|(i, _)| keep_indices.contains(i))
        .flat_map(|(_, bbox)| bbox.denormalize(factor).try_cast::<u32>())
        .collect();
    let confidence = scores
        .into_iter()
        .enumerate()
        .filter(|(i, _)| keep_indices.contains(i))
        .map(|(_, score)| score)
        .collect();
    let landmark = landmarks
        .into_iter()
        .enumerate()
        .filter(|(i, _)| keep_indices.contains(i))
        .map(|(_, landmark)| landmark)
        .collect();
    Ok(FaceDetectionOutput {
        bbox: bboxes,
        confidence,
        landmark,
    })
}
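// A minimal shape/threshold sketch for `postprocess` (the tensor shapes are
// assumptions inferred from the slicing above, not documented by the original
// source): boxes are (1, N, 4), confidences (1, N, 2), landmarks (1, N, 10),
// with N equal to the prior count. All-zero confidences sit below the default
// 0.5 threshold, so every prior should be filtered out.
#[cfg(test)]
mod postprocess_tests {
    use super::*;

    #[test]
    fn all_zero_confidence_yields_no_detections() {
        let config = FaceDetectionConfig::default();
        let n = generate_anchors(&config).shape()[0];
        let output = FaceDetectionModelOutput {
            bbox: ndarray::Array3::zeros((1, n, 4)),
            confidence: ndarray::Array3::zeros((1, n, 2)),
            landmark: ndarray::Array3::zeros((1, n, 10)),
        };
        let processed = output.postprocess(&config).unwrap();
        assert!(processed.bbox.is_empty());
        assert!(processed.confidence.is_empty());
        assert!(processed.landmarks.is_empty());
    }
}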
/// Common trait for face detection backends
pub trait FaceDetector {
    /// Run inference on the model and return its raw output tensors
    fn run_model(&mut self, image: ndarray::ArrayView3<u8>) -> Result<FaceDetectionModelOutput>;

    /// Detect faces with the full pipeline, including postprocessing and NMS
    fn detect_faces(
        &mut self,
        image: ndarray::ArrayView3<u8>,
        config: &FaceDetectionConfig,
    ) -> Result<FaceDetectionOutput> {
        let (height, width, _channels) = image.dim();
        let output = self
            .run_model(image)
            .change_context(Error)
            .attach_printable("Failed to detect faces")?;
        let processed = output
            .postprocess(config)
            .attach_printable("Failed to postprocess")?;
        apply_nms_and_finalize(processed, config, (width, height))
    }
}
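// Sketch of how a backend plugs into `FaceDetector` (the stub below is
// hypothetical; the real backends live in the `mnn` and `ort` modules). It
// also assumes `nms` returns an empty keep-list for empty input.
#[cfg(test)]
mod detector_tests {
    use super::*;

    /// Hypothetical backend that returns all-zero output tensors.
    struct ZeroDetector;

    impl FaceDetector for ZeroDetector {
        fn run_model(
            &mut self,
            _image: ndarray::ArrayView3<u8>,
        ) -> Result<FaceDetectionModelOutput> {
            let n = generate_anchors(&FaceDetectionConfig::default()).shape()[0];
            Ok(FaceDetectionModelOutput {
                bbox: ndarray::Array3::zeros((1, n, 4)),
                confidence: ndarray::Array3::zeros((1, n, 2)),
                landmark: ndarray::Array3::zeros((1, n, 10)),
            })
        }
    }

    #[test]
    fn detect_faces_runs_full_pipeline() {
        let image = ndarray::Array3::<u8>::zeros((1024, 1024, 3));
        let mut detector = ZeroDetector;
        let faces = detector
            .detect_faces(image.view(), &FaceDetectionConfig::default())
            .unwrap();
        assert!(faces.bbox.is_empty());
    }
}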