feat: Remove bbox crate and use 1024x1024 as the model input image size

2025-08-05 18:14:31 +05:30
parent 06fb0b4487
commit 043a845fc1
10 changed files with 89 additions and 839 deletions
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -18,6 +18,7 @@ pub enum SubCommand {
 #[derive(Debug, clap::ValueEnum, Clone, Copy)]
 pub enum Models {
    RetinaFace,
+    Yolo,
 }

 #[derive(Debug, clap::ValueEnum, Clone, Copy)]
@@ -49,6 +50,8 @@ pub struct Detect {
    pub output: Option<PathBuf>,
    #[clap(short, long, default_value_t = 0.8)]
    pub threshold: f32,
+    #[clap(short, long, default_value_t = 0.3)]
+    pub nms_threshold: f32,
    pub image: PathBuf,
 }

--- a/src/facedet/retinaface.rs
+++ b/src/facedet/retinaface.rs
@@ -7,7 +7,7 @@ use ndarray_resize::NdFir;
 use std::path::Path;

 pub struct FaceDetectionConfig {
-    min_sizes: Vec<Vector2<usize>>,
+    anchor_sizes: Vec<Vector2<usize>>,
    steps: Vec<usize>,
    variance: Vec<f32>,
    threshold: f32,
@@ -16,7 +16,7 @@ pub struct FaceDetectionConfig {

 impl FaceDetectionConfig {
    pub fn with_min_sizes(mut self, min_sizes: Vec<Vector2<usize>>) -> Self {
-        self.min_sizes = min_sizes;
+        self.anchor_sizes = min_sizes;
        self
    }
    pub fn with_steps(mut self, steps: Vec<usize>) -> Self {
@@ -40,7 +40,7 @@ impl FaceDetectionConfig {
 impl Default for FaceDetectionConfig {
    fn default() -> Self {
        FaceDetectionConfig {
-            min_sizes: vec![
+            anchor_sizes: vec![
                Vector2::new(16, 32),
                Vector2::new(64, 128),
                Vector2::new(256, 512),
@@ -48,7 +48,7 @@ impl Default for FaceDetectionConfig {
            steps: vec![8, 16, 32],
            variance: vec![0.1, 0.2],
            threshold: 0.8,
-            nms_threshold: 0.6,
+            nms_threshold: 0.4,
        }
    }
 }
@@ -91,15 +91,15 @@ impl FaceDetectionModelOutput {
    pub fn postprocess(self, config: &FaceDetectionConfig) -> Result<FaceDetectionProcessedOutput> {
        let mut anchors = Vec::new();
        for (k, &step) in config.steps.iter().enumerate() {
-            let feature_size = 640 / step;
-            let min_sizes = config.min_sizes[k];
+            let feature_size = 1024 / step;
+            let min_sizes = config.anchor_sizes[k];
            let sizes = [min_sizes.x, min_sizes.y];
            for i in 0..feature_size {
                for j in 0..feature_size {
                    for &size in &sizes {
-                        let cx = (j as f32 + 0.5) * step as f32 / 640.0;
-                        let cy = (i as f32 + 0.5) * step as f32 / 640.0;
-                        let s_k = size as f32 / 640.0;
+                        let cx = (j as f32 + 0.5) * step as f32 / 1024.0;
+                        let cy = (i as f32 + 0.5) * step as f32 / 1024.0;
+                        let s_k = size as f32 / 1024.0;
                        anchors.push((cx, cy, s_k, s_k));
                    }
                }
@@ -220,7 +220,7 @@ impl FaceDetection {
        image: ndarray::Array3<u8>,
        config: FaceDetectionConfig,
    ) -> Result<FaceDetectionOutput> {
-        let (height, width, channels) = image.dim();
+        let (height, width, _channels) = image.dim();
        let output = self
            .run_models(image)
            .change_context(Error)
@@ -242,17 +242,31 @@ impl FaceDetection {
            .map(|((b, s), l)| (b, s, l))
            .multiunzip();

-        let boxes = nms(&boxes, &scores, config.threshold, config.nms_threshold);
+        let keep_indices = nms(&boxes, &scores, config.threshold, config.nms_threshold);

        let bboxes = boxes
            .into_iter()
-            .flat_map(|x| x.denormalize(factor).try_cast::<usize>())
+            .enumerate()
+            .filter(|(i, _)| keep_indices.contains(i))
+            .flat_map(|(_, x)| x.denormalize(factor).try_cast::<usize>())
+            .collect();
+        let confidence = scores
+            .into_iter()
+            .enumerate()
+            .filter(|(i, _)| keep_indices.contains(i))
+            .map(|(_, score)| score)
+            .collect();
+        let landmark = landmarks
+            .into_iter()
+            .enumerate()
+            .filter(|(i, _)| keep_indices.contains(i))
+            .map(|(_, score)| score)
            .collect();

        Ok(FaceDetectionOutput {
            bbox: bboxes,
-            confidence: processed.confidence,
-            landmark: processed.landmarks,
+            confidence,
+            landmark,
        })
    }

@@ -263,7 +277,7 @@ impl FaceDetection {
            .handle
            .run(move |sr| {
                let mut resized = image
-                    .fast_resize(640, 640, None)
+                    .fast_resize(1024, 1024, None)
                    .change_context(mnn::ErrorKind::TensorError)?
                    .mapv(|f| f as f32)
                    .tap_mut(|arr| {
@@ -292,8 +306,8 @@ impl FaceDetection {
                        input.view_mut(),
                        1,
                        3,
-                        640,
-                        640,
+                        1024,
+                        1024,
                    );
                }
                intptr.resize_session(session);
--- a/src/main.rs
+++ b/src/main.rs
@@ -34,7 +34,7 @@ pub fn main() -> Result<()> {
            for bbox in output.bbox {
                tracing::info!("Detected face: {:?}", bbox);
                use bounding_box::draw::*;
-                array.draw(bbox, color::palette::css::GREEN_YELLOW.to_rgba8(), 10);
+                array.draw(bbox, color::palette::css::GREEN_YELLOW.to_rgba8(), 1);
            }
            let v = array.view();
            if let Some(output) = detect.output {