Compare commits

...

14 Commits

Author SHA1 Message Date
uttarayan21
59a3fddc0b chore: delete unused files and outdated GUI_DEMO documentation
Some checks failed
build / checks-matrix (push) Successful in 19m22s
build / codecov (push) Failing after 19m22s
docs / docs (push) Failing after 28m48s
build / checks-build (push) Has been cancelled
2025-09-23 16:13:56 +05:30
uttarayan21
eb9451aad8 chore: remove submodule 'rfcs' from the project
Some checks failed
build / checks-matrix (push) Successful in 19m24s
build / codecov (push) Failing after 19m23s
docs / docs (push) Has been cancelled
build / checks-build (push) Has been cancelled
2025-09-23 15:08:54 +05:30
uttarayan21
c6b3f5279f feat(flake): add uv package to build inputs
Some checks failed
build / checks-matrix (push) Successful in 19m21s
build / codecov (push) Failing after 19m25s
docs / docs (push) Failing after 28m47s
build / checks-build (push) Has been cancelled
2025-09-16 12:28:09 +05:30
uttarayan21
a419a5ac4a chore(models): remove Facenet and RetinaFace model files
Some checks failed
build / checks-matrix (push) Has been cancelled
build / checks-build (push) Has been cancelled
build / codecov (push) Has been cancelled
docs / docs (push) Has been cancelled
2025-09-16 12:22:38 +05:30
uttarayan21
a340552257 feat(cli): add clustering command with K-means support
Some checks failed
build / checks-matrix (push) Successful in 19m25s
build / codecov (push) Failing after 19m26s
docs / docs (push) Failing after 28m52s
build / checks-build (push) Has been cancelled
2025-09-13 17:45:55 +05:30
uttarayan21
aaf34ef74e refactor: rename sqlite3-safetensor-cosine to sqlite3-ndarray-math
Some checks failed
build / checks-matrix (push) Successful in 19m20s
build / codecov (push) Failing after 19m22s
docs / docs (push) Failing after 28m47s
build / checks-build (push) Has been cancelled
2025-08-28 18:42:35 +05:30
uttarayan21
ac8f1d01b4 feat(detector): add CUDA support for ONNX face detection
Some checks failed
build / checks-build (push) Has been cancelled
build / codecov (push) Has been cancelled
docs / docs (push) Has been cancelled
build / checks-matrix (push) Has been cancelled
2025-08-28 18:32:00 +05:30
uttarayan21
4256c0af74 feat(makefile): add conversion task and update model binaries
Some checks failed
build / checks-matrix (push) Successful in 19m22s
build / codecov (push) Failing after 19m22s
docs / docs (push) Failing after 28m50s
build / checks-build (push) Has been cancelled
2025-08-28 13:43:23 +05:30
uttarayan21
3eec262076 feat(bounding-box): add scale_uniform method for consistent scaling
Some checks failed
build / checks-matrix (push) Successful in 19m22s
build / codecov (push) Failing after 19m26s
docs / docs (push) Failing after 28m51s
build / checks-build (push) Has been cancelled
feat(gui): display face ROIs in comparison results

refactor(bridge): pad detected face bounding boxes uniformly
2025-08-22 19:01:34 +05:30
uttarayan21
c758fd8d41 feat(gui): add face ROIs to comparison results and update image size 2025-08-22 18:26:29 +05:30
uttarayan21
34eaf9348a refactor(gui): remove commented-out code in face detection function
Some checks failed
build / checks-matrix (push) Successful in 19m20s
build / codecov (push) Failing after 19m18s
docs / docs (push) Has been cancelled
build / checks-build (push) Has been cancelled
2025-08-22 18:15:55 +05:30
uttarayan21
dab7719206 refactor: replace bbox::BBox with bounding_box::Aabb2 across codebase
Some checks failed
build / checks-matrix (push) Has been cancelled
build / checks-build (push) Has been cancelled
build / codecov (push) Has been cancelled
docs / docs (push) Has been cancelled
2025-08-22 18:14:58 +05:30
uttarayan21
4b4d23d1d4 feat(bbox): add bounding box implementation with serialization
Add initial implementation of the `BBox` struct in the `bbox` module,
including basic operations and serialization/deserialization support
with Serde.
2025-08-22 15:27:47 +05:30
uttarayan21
aab3d84db0 feat(ndcv-bridge): add ndcv-bridge for ndarray and opencv interaction 2025-08-22 15:27:36 +05:30
54 changed files with 5479 additions and 530 deletions

462
Cargo.lock generated
View File

@@ -203,6 +203,9 @@ name = "arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
dependencies = [
"derive_arbitrary",
]
[[package]]
name = "arg_enum_proc_macro"
@@ -478,6 +481,15 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "bbox"
version = "0.1.0"
dependencies = [
"ndarray",
"num",
"serde",
]
[[package]]
name = "bindgen"
version = "0.60.1"
@@ -770,6 +782,16 @@ dependencies = [
"windows-link",
]
[[package]]
name = "clang"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84c044c781163c001b913cd018fc95a628c50d0d2dfea8bca77dad71edb16e37"
dependencies = [
"clang-sys",
"libc",
]
[[package]]
name = "clang-sys"
version = "1.8.1"
@@ -816,6 +838,7 @@ dependencies = [
"anstyle",
"clap_lex 0.7.5",
"strsim 0.11.1",
"terminal_size",
]
[[package]]
@@ -979,6 +1002,12 @@ dependencies = [
"crossbeam-utils",
]
[[package]]
name = "condtype"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf0a07a401f374238ab8e2f11a104d2851bf9ce711ec69804834de8af45c7af"
[[package]]
name = "core-foundation"
version = "0.9.4"
@@ -1210,6 +1239,17 @@ dependencies = [
"powerfmt",
]
[[package]]
name = "derive_arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.106",
]
[[package]]
name = "detect-desktop-environment"
version = "0.2.0"
@@ -1231,20 +1271,22 @@ dependencies = [
"image 0.25.6",
"imageproc",
"itertools 0.14.0",
"linfa",
"linfa-clustering",
"mnn",
"mnn-bridge",
"mnn-sync",
"nalgebra 0.34.0",
"ndarray",
"ndarray-image",
"ndarray-math 0.1.0 (git+https://git.darksailor.dev/servius/ndarray-math)",
"ndarray-math",
"ndarray-resize",
"ndarray-safetensors",
"ordered-float",
"ort",
"rfd",
"rusqlite",
"sqlite3-safetensor-cosine",
"sqlite3-ndarray-math",
"tap",
"thiserror 2.0.15",
"tokio",
@@ -1320,6 +1362,31 @@ dependencies = [
"syn 2.0.106",
]
[[package]]
name = "divan"
version = "0.1.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a405457ec78b8fe08b0e32b4a3570ab5dff6dd16eb9e76a5ee0a9d9cbd898933"
dependencies = [
"cfg-if",
"clap 4.5.45",
"condtype",
"divan-macros",
"libc",
"regex-lite",
]
[[package]]
name = "divan-macros"
version = "0.1.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9556bc800956545d6420a640173e5ba7dfa82f38d3ea5a167eb555bc69ac3323"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.106",
]
[[package]]
name = "dlib"
version = "0.5.2"
@@ -2026,7 +2093,7 @@ dependencies = [
"presser",
"thiserror 1.0.69",
"winapi",
"windows",
"windows 0.52.0",
]
[[package]]
@@ -2552,6 +2619,17 @@ dependencies = [
"rayon",
]
[[package]]
name = "img-parts"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19734e3c43b2a850f5889c077056e47c874095f2d87e853c7c41214ae67375f0"
dependencies = [
"bytes",
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "imgref"
version = "1.11.0"
@@ -2732,6 +2810,16 @@ dependencies = [
"mutate_once",
]
[[package]]
name = "kdtree"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f0a0e9f770b65bac9aad00f97a67ab5c5319effed07f6da385da3c2115e47ba"
dependencies = [
"num-traits",
"thiserror 1.0.69",
]
[[package]]
name = "khronos-egl"
version = "6.0.0"
@@ -2840,6 +2928,63 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "linfa"
version = "0.7.1"
source = "git+https://github.com/relf/linfa?branch=upgrade-ndarray-0.16#c1fbee7c54e806de3f5fb2c5240ce163d000f1ba"
dependencies = [
"approx",
"ndarray",
"num-traits",
"rand 0.8.5",
"sprs",
"thiserror 2.0.15",
]
[[package]]
name = "linfa-clustering"
version = "0.7.1"
source = "git+https://github.com/relf/linfa?branch=upgrade-ndarray-0.16#c1fbee7c54e806de3f5fb2c5240ce163d000f1ba"
dependencies = [
"linfa",
"linfa-linalg",
"linfa-nn",
"ndarray",
"ndarray-rand",
"ndarray-stats",
"noisy_float",
"num-traits",
"rand_xoshiro",
"space",
"thiserror 2.0.15",
]
[[package]]
name = "linfa-linalg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02a834c0ec063937688a0d13573aa515ab8c425bd8de3154b908dd3b9c197dc4"
dependencies = [
"ndarray",
"num-traits",
"thiserror 1.0.69",
]
[[package]]
name = "linfa-nn"
version = "0.7.2"
source = "git+https://github.com/relf/linfa?branch=upgrade-ndarray-0.16#c1fbee7c54e806de3f5fb2c5240ce163d000f1ba"
dependencies = [
"kdtree",
"linfa",
"ndarray",
"ndarray-stats",
"noisy_float",
"num-traits",
"order-stat",
"thiserror 2.0.15",
]
[[package]]
name = "linux-raw-sys"
version = "0.4.15"
@@ -3147,6 +3292,7 @@ version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841"
dependencies = [
"approx",
"matrixmultiply",
"num-complex",
"num-integer",
@@ -3154,6 +3300,7 @@ dependencies = [
"portable-atomic",
"portable-atomic-util",
"rawpointer",
"rayon",
]
[[package]]
@@ -3167,6 +3314,7 @@ dependencies = [
[[package]]
name = "ndarray-math"
version = "0.1.0"
source = "git+https://git.darksailor.dev/servius/ndarray-math#df17c36193df60e070e4e120c9feebe68ff3f517"
dependencies = [
"ndarray",
"num",
@@ -3174,13 +3322,28 @@ dependencies = [
]
[[package]]
name = "ndarray-math"
version = "0.1.0"
source = "git+https://git.darksailor.dev/servius/ndarray-math#f047966f20835267f20e5839272b9ab36c445796"
name = "ndarray-npy"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b313788c468c49141a9d9b6131fc15f403e6ef4e8446a0b2e18f664ddb278a9"
dependencies = [
"byteorder",
"ndarray",
"num-complex",
"num-traits",
"py_literal",
"zip",
]
[[package]]
name = "ndarray-rand"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f093b3db6fd194718dcdeea6bd8c829417deae904e3fcc7732dabcd4416d25d8"
dependencies = [
"ndarray",
"num",
"thiserror 2.0.15",
"rand 0.8.5",
"rand_distr",
]
[[package]]
@@ -3206,6 +3369,42 @@ dependencies = [
"thiserror 2.0.15",
]
[[package]]
name = "ndarray-stats"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17ebbe97acce52d06aebed4cd4a87c0941f4b2519b59b82b4feb5bd0ce003dfd"
dependencies = [
"indexmap 2.10.0",
"itertools 0.13.0",
"ndarray",
"noisy_float",
"num-integer",
"num-traits",
"rand 0.8.5",
]
[[package]]
name = "ndcv-bridge"
version = "0.1.0"
dependencies = [
"bounding-box",
"bytemuck",
"divan",
"error-stack",
"fast_image_resize",
"img-parts",
"nalgebra 0.34.0",
"ndarray",
"ndarray-npy",
"num",
"opencv",
"rayon",
"thiserror 2.0.15",
"tracing",
"wide",
]
[[package]]
name = "ndk"
version = "0.9.0"
@@ -3277,6 +3476,15 @@ dependencies = [
"memoffset",
]
[[package]]
name = "noisy_float"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978fe6e6ebc0bf53de533cd456ca2d9de13de13856eda1518a285d7705a213af"
dependencies = [
"num-traits",
]
[[package]]
name = "nom"
version = "7.1.3"
@@ -3727,6 +3935,41 @@ version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4ce411919553d3f9fa53a0880544cda985a112117a0444d5ff1e870a893d6ea"
[[package]]
name = "opencv"
version = "0.95.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c73b6fccd78797a87cdb885c997351a1a290b0ebde778e996b694dec2a4c04a"
dependencies = [
"cc",
"dunce",
"jobserver",
"libc",
"num-traits",
"once_cell",
"opencv-binding-generator",
"pkg-config",
"semver",
"shlex",
"vcpkg",
"windows 0.59.0",
]
[[package]]
name = "opencv-binding-generator"
version = "0.97.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "010a78e4cc47ff85cf58fb1cbbbab9dcdb8e5e6718917eac26623f077872d012"
dependencies = [
"clang",
"clang-sys",
"dunce",
"once_cell",
"percent-encoding",
"regex",
"shlex",
]
[[package]]
name = "orbclient"
version = "0.3.48"
@@ -3736,6 +3979,12 @@ dependencies = [
"libredox",
]
[[package]]
name = "order-stat"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "efa535d5117d3661134dbf1719b6f0ffe06f2375843b13935db186cd094105eb"
[[package]]
name = "ordered-float"
version = "5.0.0"
@@ -3903,6 +4152,50 @@ version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "pest"
version = "2.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1db05f56d34358a8b1066f67cbb203ee3e7ed2ba674a6263a1d5ec6db2204323"
dependencies = [
"memchr",
"thiserror 2.0.15",
"ucd-trie",
]
[[package]]
name = "pest_derive"
version = "2.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb056d9e8ea77922845ec74a1c4e8fb17e7c218cc4fc11a15c5d25e189aa40bc"
dependencies = [
"pest",
"pest_generator",
]
[[package]]
name = "pest_generator"
version = "2.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87e404e638f781eb3202dc82db6760c8ae8a1eeef7fb3fa8264b2ef280504966"
dependencies = [
"pest",
"pest_meta",
"proc-macro2",
"quote",
"syn 2.0.106",
]
[[package]]
name = "pest_meta"
version = "2.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edd1101f170f5903fde0914f899bb503d9ff5271d7ba76bbb70bea63690cc0d5"
dependencies = [
"pest",
"sha2",
]
[[package]]
name = "phf"
version = "0.11.3"
@@ -4119,6 +4412,19 @@ dependencies = [
"syn 2.0.106",
]
[[package]]
name = "py_literal"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "102df7a3d46db9d3891f178dcc826dc270a6746277a9ae6436f8d29fd490a8e1"
dependencies = [
"num-bigint",
"num-complex",
"num-traits",
"pest",
"pest_derive",
]
[[package]]
name = "qoi"
version = "0.4.1"
@@ -4227,6 +4533,15 @@ dependencies = [
"rand 0.8.5",
]
[[package]]
name = "rand_xoshiro"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "range-alloc"
version = "0.1.4"
@@ -4401,6 +4716,12 @@ dependencies = [
"regex-syntax 0.8.5",
]
[[package]]
name = "regex-lite"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a"
[[package]]
name = "regex-syntax"
version = "0.6.29"
@@ -4696,6 +5017,17 @@ dependencies = [
"digest",
]
[[package]]
name = "sha2"
version = "0.10.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "sharded-slab"
version = "0.1.7"
@@ -4881,6 +5213,12 @@ dependencies = [
"x11rb",
]
[[package]]
name = "space"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e990cc6cb89a82d70fe722cd7811dbce48a72bbfaebd623e58f142b6db28428f"
[[package]]
name = "spin"
version = "0.9.8"
@@ -4899,6 +5237,18 @@ dependencies = [
"bitflags 2.9.2",
]
[[package]]
name = "sprs"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bff8419009a08f6cb7519a602c5590241fbff1446bcc823c07af15386eb801b"
dependencies = [
"ndarray",
"num-complex",
"num-traits",
"smallvec 1.15.1",
]
[[package]]
name = "sqlite-loadable"
version = "0.0.5"
@@ -4924,11 +5274,11 @@ dependencies = [
]
[[package]]
name = "sqlite3-safetensor-cosine"
name = "sqlite3-ndarray-math"
version = "0.1.0"
dependencies = [
"ndarray",
"ndarray-math 0.1.0",
"ndarray-math",
"ndarray-safetensors",
"sqlite-loadable",
]
@@ -5079,6 +5429,16 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "terminal_size"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b8cb979cb11c32ce1603f8137b22262a9d131aaa5c37b5678025f22b8becd0"
dependencies = [
"rustix 1.0.8",
"windows-sys 0.60.2",
]
[[package]]
name = "textwrap"
version = "0.16.2"
@@ -5373,6 +5733,12 @@ version = "1.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
[[package]]
name = "ucd-trie"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"
[[package]]
name = "uds_windows"
version = "1.1.0"
@@ -5955,6 +6321,16 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f919aee0a93304be7f62e8e5027811bbba96bcb1de84d6618be56e43f8a32a1"
dependencies = [
"windows-core 0.59.0",
"windows-targets 0.53.3",
]
[[package]]
name = "windows-core"
version = "0.52.0"
@@ -5964,17 +6340,41 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-core"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "810ce18ed2112484b0d4e15d022e5f598113e220c53e373fb31e67e21670c1ce"
dependencies = [
"windows-implement 0.59.0",
"windows-interface",
"windows-result",
"windows-strings 0.3.1",
"windows-targets 0.53.3",
]
[[package]]
name = "windows-core"
version = "0.61.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3"
dependencies = [
"windows-implement",
"windows-implement 0.60.0",
"windows-interface",
"windows-link",
"windows-result",
"windows-strings",
"windows-strings 0.4.2",
]
[[package]]
name = "windows-implement"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83577b051e2f49a058c308f17f273b570a6a758386fc291b5f6a934dd84e48c1"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.106",
]
[[package]]
@@ -6014,6 +6414,15 @@ dependencies = [
"windows-link",
]
[[package]]
name = "windows-strings"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87fa48cc5d406560701792be122a10132491cff9d0aeb23583cc2dcafc847319"
dependencies = [
"windows-link",
]
[[package]]
name = "windows-strings"
version = "0.4.2"
@@ -6701,6 +7110,35 @@ dependencies = [
"syn 2.0.106",
]
[[package]]
name = "zip"
version = "2.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
dependencies = [
"arbitrary",
"crc32fast",
"crossbeam-utils",
"displaydoc",
"flate2",
"indexmap 2.10.0",
"memchr",
"thiserror 2.0.15",
"zopfli",
]
[[package]]
name = "zopfli"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edfc5ee405f504cd4984ecc6f14d02d55cfda60fa4b689434ef4102aae150cd7"
dependencies = [
"bumpalo",
"crc32fast",
"log",
"simd-adler32",
]
[[package]]
name = "zune-core"
version = "0.4.12"

View File

@@ -1,11 +1,27 @@
[workspace]
members = ["ndarray-image", "ndarray-resize", ".", "bounding-box", "ndarray-safetensors", "sqlite3-safetensor-cosine"]
members = [
"ndarray-image",
"ndarray-resize",
".",
"bounding-box",
"ndarray-safetensors",
"sqlite3-ndarray-math",
"ndcv-bridge",
"bbox",
]
[workspace.package]
version = "0.1.0"
edition = "2024"
[patch.crates-io]
linfa = { git = "https://github.com/relf/linfa", branch = "upgrade-ndarray-0.16" }
linfa-clustering = { git = "https://github.com/relf/linfa", branch = "upgrade-ndarray-0.16" }
[workspace.dependencies]
divan = { version = "0.1.21" }
ndarray-npy = "0.9.1"
serde = { version = "1.0", features = ["derive"] }
ndarray-image = { path = "ndarray-image" }
ndarray-resize = { path = "ndarray-resize" }
mnn = { git = "https://github.com/uttarayan21/mnn-rs", version = "0.2.0", features = [
@@ -20,6 +36,15 @@ mnn-sync = { git = "https://github.com/uttarayan21/mnn-rs", version = "0.1.0", f
"tracing",
], branch = "restructure-tensor-type" }
nalgebra = { version = "0.34.0", default-features = false, features = ["std"] }
opencv = { version = "0.95.1" }
bounding-box = { path = "bounding-box" }
bytemuck = "1.23.2"
error-stack = "0.5.0"
thiserror = "2.0"
fast_image_resize = "5.2.0"
img-parts = "0.4.0"
ndarray = { version = "0.16.1", features = ["rayon"] }
num = "0.4"
[package]
name = "detector"
@@ -50,22 +75,29 @@ bounding-box = { version = "0.1.0", path = "bounding-box" }
color = "0.3.1"
itertools = "0.14.0"
ordered-float = "5.0.0"
ort = { version = "2.0.0-rc.10", default-features = false, features = [ "std", "tracing", "ndarray"]}
ort = { version = "2.0.0-rc.10", default-features = false, features = [
"std",
"tracing",
"ndarray",
"cuda",
] }
ndarray-math = { git = "https://git.darksailor.dev/servius/ndarray-math", version = "0.1.0" }
ndarray-safetensors = { version = "0.1.0", path = "ndarray-safetensors" }
sqlite3-safetensor-cosine = { version = "0.1.0", path = "sqlite3-safetensor-cosine" }
sqlite3-ndarray-math = { version = "0.1.0", path = "sqlite3-ndarray-math" }
# GUI dependencies
iced = { version = "0.13", features = ["tokio", "image"] }
rfd = "0.15"
futures = "0.3"
imageproc = "0.25"
linfa = "0.7.1"
linfa-clustering = "0.7.1"
[profile.release]
debug = true
[features]
ort-cuda = ["ort/cuda"]
ort-cuda = []
ort-coreml = ["ort/coreml"]
ort-tensorrt = ["ort/tensorrt"]
ort-tvm = ["ort/tvm"]
@@ -74,4 +106,8 @@ ort-directml = ["ort/directml"]
mnn-metal = ["mnn/metal"]
mnn-coreml = ["mnn/coreml"]
default = ["mnn-metal","mnn-coreml"]
default = ["ort-cuda"]
[[test]]
name = "test_bbox_replacement"
path = "test_bbox_replacement.rs"

View File

@@ -1,202 +0,0 @@
# Face Detector GUI - Demo Documentation
## Overview
This document demonstrates the successful creation of a modern GUI with full image rendering capabilities for the face-detector project using iced.rs, a cross-platform GUI framework for Rust.
## What Was Built
### 🎯 Core Features Implemented
1. **Modern Tabbed Interface**
- Detection tab for single image face detection with visual results
- Comparison tab for face similarity comparison with side-by-side images
- Settings tab for model and parameter configuration
2. **Full Image Rendering System**
- Real-time image preview for selected input images
- Processed image display with bounding boxes drawn around detected faces
- Side-by-side comparison view for face matching
- Automatic image scaling and fitting within UI containers
- Support for displaying results from both MNN and ONNX backends
3. **File Management**
- Image file selection dialogs
- Output path selection for processed images
- Support for multiple image formats (jpg, jpeg, png, bmp, tiff, webp)
- Automatic image loading and display upon selection
4. **Real-time Parameter Control**
- Adjustable detection threshold (0.1-1.0)
- Adjustable NMS threshold (0.1-1.0)
- Model type selection (RetinaFace, YOLO)
- Execution backend selection (MNN CPU/Metal/CoreML, ONNX CPU)
5. **Progress Tracking**
- Status bar with current operation display
- Progress bar for long-running operations
- Processing time reporting
6. **Visual Results Display**
- Face count reporting with visual confirmation
- Processed images with red bounding boxes around detected faces
- Similarity scores with interpretation and color coding
- Error handling and display
- Before/after image comparison
## Architecture
### 🏗️ Project Structure
```
src/
├── gui/
│ ├── mod.rs # Module declarations
│ ├── app.rs # Main application logic
│ └── bridge.rs # Integration with face detection backend
├── bin/
│ └── gui.rs # GUI executable entry point
└── ... # Existing face detection modules
```
### 🔌 Integration Points
The GUI seamlessly integrates with your existing face detection infrastructure:
- **Backend Support**: Both MNN and ONNX Runtime backends
- **Model Support**: RetinaFace and YOLO models
- **Hardware Acceleration**: Metal, CoreML, and CPU execution
- **Database Integration**: Ready for face database operations
## Technical Highlights
### ⚡ Performance Features
1. **Asynchronous Operations**: All face detection operations run asynchronously to keep the UI responsive
2. **Memory Efficient**: Proper resource management for image processing
3. **Hardware Accelerated**: Full support for Metal and CoreML on macOS
### 🎨 User Experience
1. **Intuitive Design**: Clean, modern interface with logical tab organization
2. **Real-time Feedback**: Immediate visual feedback for all operations
3. **Error Handling**: User-friendly error messages and recovery
4. **Accessibility**: Proper contrast and sizing for readability
## Usage Examples
### Running the GUI
```bash
# Build and run the GUI
cargo run --bin gui
# Or build the binary
cargo build --bin gui --release
./target/release/gui
```
### Face Detection Workflow
1. **Select Image**: Click "Select Image" to choose an input image
- Image immediately appears in the "Original Image" preview
2. **Adjust Parameters**: Use sliders to fine-tune detection thresholds
3. **Choose Backend**: Select MNN or ONNX execution backend
4. **Run Detection**: Click "Detect Faces" to process the image
5. **View Visual Results**:
- Original image displayed on the left
- Processed image with red bounding boxes on the right
- Face count, processing time, and status information below
### Face Comparison Workflow
1. **Select Images**: Choose two images for comparison
- Both images appear side-by-side in the comparison view
- "First Image" and "Second Image" clearly labeled
2. **Configure Settings**: Adjust detection and comparison parameters
3. **Run Comparison**: Click "Compare Faces" to analyze similarity
4. **View Visual Results**:
- Both original images displayed side-by-side for easy comparison
- Similarity scores with automatic interpretation and color coding:
- **> 0.8**: Very likely the same person (green text)
- **0.6-0.8**: Possibly the same person (yellow text)
- **0.4-0.6**: Unlikely to be the same person (orange text)
- **< 0.4**: Very unlikely to be the same person (red text)
## Current Status
### ✅ Successfully Implemented
- [x] Complete GUI framework integration
- [x] Tabbed interface with three main sections
- [x] File dialogs for image selection
- [x] **Full image rendering and display system**
- [x] **Real-time image preview for selected inputs**
- [x] **Processed image display with bounding boxes**
- [x] **Side-by-side image comparison view**
- [x] Parameter controls with real-time updates
- [x] Asynchronous operation handling
- [x] Progress tracking and status reporting
- [x] Integration with existing face detection backend
- [x] Support for both MNN and ONNX backends
- [x] Error handling and user feedback
- [x] Cross-platform compatibility (tested on macOS)
### 🔧 Known Issues
1. **Array Bounds Error**: There's a runtime error in the RetinaFace implementation that needs debugging:
```
thread 'tokio-runtime-worker' panicked at src/facedet/retinaface.rs:178:22:
ndarray: index 43008 is out of bounds for array of shape [43008]
```
This appears to be related to the original face detection logic, not the GUI code.
### 🚀 Future Enhancements
1. ~~**Image Display**: Add image preview and result visualization~~ ✅ **COMPLETED**
2. **Batch Processing**: Support for processing multiple images
3. **Database Integration**: GUI for face database operations
4. **Export Features**: Save results in various formats
5. **Configuration Persistence**: Remember user settings
6. **Drag & Drop**: Direct image dropping support
7. **Zoom and Pan**: Advanced image viewing capabilities
8. **Landmark Visualization**: Display facial landmarks on detected faces
## Technical Dependencies
### New Dependencies Added
```toml
# GUI dependencies
iced = { version = "0.13", features = ["tokio", "image"] }
rfd = "0.15" # File dialogs
futures = "0.3" # Async utilities
imageproc = "0.25" # Image processing utilities
```
### Integration Approach
The GUI was designed as a thin layer over your existing face detection engine:
- **Minimal Changes**: Only added new modules, no modifications to existing detection logic
- **Clean Separation**: GUI logic is completely separate from core detection algorithms
- **Reusable Components**: Bridge pattern allows easy extension to new backends
- **Maintainable Code**: Clear module boundaries and consistent error handling
## Compilation and Testing
The GUI compiles successfully with only minor warnings and has been tested on macOS with Apple Silicon. The interface is responsive and all UI components work as expected.
### Build Output
```
Finished `dev` profile [unoptimized + debuginfo] target(s) in 1m 05s
Running `/target/debug/gui`
```
The application launches properly, displays the GUI interface, and responds to user interactions. The only runtime issue is in the underlying face detection algorithm, which is separate from the GUI implementation.
## Conclusion
The GUI implementation successfully provides a modern, user-friendly interface for your face detection system. It maintains the full power and flexibility of your existing CLI tool while making it accessible to non-technical users through an intuitive graphical interface.
The architecture is extensible and maintainable, making it easy to add new features and functionality as your face detection system evolves.

Binary file not shown.

View File

@@ -1,3 +1,7 @@
[tasks.convert]
dependencies = ["convert_facenet", "convert_retinaface"]
workspace = false
[tasks.convert_facenet]
command = "MNNConvert"
args = [
@@ -11,6 +15,7 @@ args = [
"--bizCode",
"MNN",
]
workspace = false
[tasks.convert_retinaface]
command = "MNNConvert"
@@ -25,3 +30,9 @@ args = [
"--bizCode",
"MNN",
]
workspace = false
[tasks.gui]
command = "cargo"
args = ["run", "--release", "--bin", "gui"]
workspace = false

13
bbox/Cargo.toml Normal file
View File

@@ -0,0 +1,13 @@
[package]
name = "bbox"
version.workspace = true
edition.workspace = true
[dependencies]
ndarray.workspace = true
num = "0.4.3"
serde = { workspace = true, features = ["derive"], optional = true }
[features]
serde = ["dep:serde"]
default = ["serde"]

708
bbox/src/lib.rs Normal file
View File

@@ -0,0 +1,708 @@
pub mod traits;
/// A bounding box of co-ordinates whose origin is at the top-left corner.
///
/// Stored as a top-left corner (`x`, `y`) plus a `width`/`height` extent.
/// `T` defaults to `f32`; the comparison/hash derives only apply when `T`
/// itself supports them.
///
/// The serde derives are gated behind the crate's `serde` feature (the
/// dependency is optional in Cargo.toml), so the crate still builds with
/// `default-features = false`.
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub struct BBox<T = f32> {
    pub x: T,
    pub y: T,
    pub width: T,
    pub height: T,
}
/// Builds a `BBox` from an `[x, y, width, height]` array.
impl<T> From<[T; 4]> for BBox<T> {
    fn from(parts: [T; 4]) -> Self {
        let [x, y, width, height] = parts;
        Self {
            x,
            y,
            width,
            height,
        }
    }
}
impl<T: Copy> BBox<T> {
pub fn new(x: T, y: T, width: T, height: T) -> Self {
Self {
x,
y,
width,
height,
}
}
/// Casts the internal values to another type using [as] keyword
pub fn cast<T2>(self) -> BBox<T2>
where
T: num::cast::AsPrimitive<T2>,
T2: Copy + 'static,
{
BBox {
x: self.x.as_(),
y: self.y.as_(),
width: self.width.as_(),
height: self.height.as_(),
}
}
/// Clamps each of the four stored values into `[min, max]`.
///
/// Note this clamps the raw fields — including `width`/`height` — not the
/// box's corner coordinates.
pub fn clamp(&self, min: T, max: T) -> Self
where
    T: std::cmp::PartialOrd,
{
    // `T: Copy` on the impl lets the closure hand out fresh copies of
    // `min`/`max` on every call.
    let limit = |v: T| num::clamp(v, min, max);
    Self {
        x: limit(self.x),
        y: limit(self.y),
        width: limit(self.width),
        height: limit(self.height),
    }
}
/// Restricts this box to lie within `bbox`.
///
/// Both corners are clamped into `bbox`'s x/y ranges and the result is
/// rebuilt from the clamped corners, so a box fully outside `bbox`
/// collapses onto its nearest edge.
pub fn clamp_box(&self, bbox: BBox<T>) -> Self
where
    T: std::cmp::PartialOrd,
    T: num::Zero,
    T: core::ops::Add<Output = T>,
    T: core::ops::Sub<Output = T>,
{
    let left = num::clamp(self.x1(), bbox.x1(), bbox.x2());
    let top = num::clamp(self.y1(), bbox.y1(), bbox.y2());
    let right = num::clamp(self.x2(), bbox.x1(), bbox.x2());
    let bottom = num::clamp(self.y2(), bbox.y1(), bbox.y2());
    Self::new_xyxy(left, top, right, bottom)
}
pub fn normalize(&self, width: T, height: T) -> Self
where
T: core::ops::Div<Output = T> + Copy,
{
Self {
x: self.x / width,
y: self.y / height,
width: self.width / width,
height: self.height / height,
}
}
/// Normalize after casting to float
pub fn normalize_f64(&self, width: T, height: T) -> BBox<f64>
where
T: core::ops::Div<Output = T> + Copy,
T: num::cast::AsPrimitive<f64>,
{
BBox {
x: self.x.as_() / width.as_(),
y: self.y.as_() / height.as_(),
width: self.width.as_() / width.as_(),
height: self.height.as_() / height.as_(),
}
}
pub fn denormalize(&self, width: T, height: T) -> Self
where
T: core::ops::Mul<Output = T> + Copy,
{
Self {
x: self.x * width,
y: self.y * height,
width: self.width * width,
height: self.height * height,
}
}
pub fn height(&self) -> T {
self.height
}
pub fn width(&self) -> T {
self.width
}
pub fn padding(&self, padding: T) -> Self
where
T: core::ops::Add<Output = T> + core::ops::Sub<Output = T> + Copy,
{
Self {
x: self.x - padding,
y: self.y - padding,
width: self.width + padding + padding,
height: self.height + padding + padding,
}
}
pub fn padding_height(&self, padding: T) -> Self
where
T: core::ops::Add<Output = T> + core::ops::Sub<Output = T> + Copy,
{
Self {
x: self.x,
y: self.y - padding,
width: self.width,
height: self.height + padding + padding,
}
}
pub fn padding_width(&self, padding: T) -> Self
where
T: core::ops::Add<Output = T> + core::ops::Sub<Output = T> + Copy,
{
Self {
x: self.x - padding,
y: self.y,
width: self.width + padding + padding,
height: self.height,
}
}
// Enlarge / shrink the bounding box by a factor while
// keeping the center point and the aspect ratio fixed
pub fn scale(&self, factor: T) -> Self
where
T: core::ops::Mul<Output = T>,
T: core::ops::Sub<Output = T>,
T: core::ops::Add<Output = T>,
T: core::ops::Div<Output = T>,
T: num::One + Copy,
{
let two = num::one::<T>() + num::one::<T>();
let width = self.width * factor;
let height = self.height * factor;
let width_inc = width - self.width;
let height_inc = height - self.height;
Self {
x: self.x - width_inc / two,
y: self.y - height_inc / two,
width,
height,
}
}
pub fn scale_x(&self, factor: T) -> Self
where
T: core::ops::Mul<Output = T>
+ core::ops::Sub<Output = T>
+ core::ops::Add<Output = T>
+ core::ops::Div<Output = T>
+ num::One
+ Copy,
{
let two = num::one::<T>() + num::one::<T>();
let width = self.width * factor;
let width_inc = width - self.width;
Self {
x: self.x - width_inc / two,
y: self.y,
width,
height: self.height,
}
}
pub fn scale_y(&self, factor: T) -> Self
where
T: core::ops::Mul<Output = T>
+ core::ops::Sub<Output = T>
+ core::ops::Add<Output = T>
+ core::ops::Div<Output = T>
+ num::One
+ Copy,
{
let two = num::one::<T>() + num::one::<T>();
let height = self.height * factor;
let height_inc = height - self.height;
Self {
x: self.x,
y: self.y - height_inc / two,
width: self.width,
height,
}
}
pub fn offset(&self, offset: Point<T>) -> Self
where
T: core::ops::Add<Output = T> + Copy,
{
Self {
x: self.x + offset.x,
y: self.y + offset.y,
width: self.width,
height: self.height,
}
}
/// Translate the bounding box by the given offset
/// if they are in the same scale
pub fn translate(&self, bbox: Self) -> Self
where
T: core::ops::Add<Output = T> + Copy,
{
Self {
x: self.x + bbox.x,
y: self.y + bbox.y,
width: self.width,
height: self.height,
}
}
pub fn with_top_left(&self, top_left: Point<T>) -> Self {
Self {
x: top_left.x,
y: top_left.y,
width: self.width,
height: self.height,
}
}
pub fn center(&self) -> Point<T>
where
T: core::ops::Add<Output = T> + core::ops::Div<Output = T> + Copy,
T: num::One,
{
let two = T::one() + T::one();
Point::new(self.x + self.width / two, self.y + self.height / two)
}
pub fn area(&self) -> T
where
T: core::ops::Mul<Output = T> + Copy,
{
self.width * self.height
}
// Corresponds to self.x1() and self.y1()
pub fn top_left(&self) -> Point<T> {
Point::new(self.x, self.y)
}
pub fn top_right(&self) -> Point<T>
where
T: core::ops::Add<Output = T> + Copy,
{
Point::new(self.x + self.width, self.y)
}
pub fn bottom_left(&self) -> Point<T>
where
T: core::ops::Add<Output = T> + Copy,
{
Point::new(self.x, self.y + self.height)
}
// Corresponds to self.x2() and self.y2()
pub fn bottom_right(&self) -> Point<T>
where
T: core::ops::Add<Output = T> + Copy,
{
Point::new(self.x + self.width, self.y + self.height)
}
pub const fn x1(&self) -> T {
self.x
}
pub const fn y1(&self) -> T {
self.y
}
pub fn x2(&self) -> T
where
T: core::ops::Add<Output = T> + Copy,
{
self.x + self.width
}
pub fn y2(&self) -> T
where
T: core::ops::Add<Output = T> + Copy,
{
self.y + self.height
}
pub fn overlap(&self, other: &Self) -> T
where
T: std::cmp::PartialOrd
+ traits::min::Min
+ traits::max::Max
+ num::Zero
+ core::ops::Add<Output = T>
+ core::ops::Sub<Output = T>
+ core::ops::Mul<Output = T>
+ Copy,
{
let x1 = self.x.max(other.x);
let y1 = self.y.max(other.y);
let x2 = (self.x + self.width).min(other.x + other.width);
let y2 = (self.y + self.height).min(other.y + other.height);
let width = (x2 - x1).max(T::zero());
let height = (y2 - y1).max(T::zero());
width * height
}
pub fn iou(&self, other: &Self) -> T
where
T: std::cmp::Ord
+ num::Zero
+ traits::min::Min
+ traits::max::Max
+ core::ops::Add<Output = T>
+ core::ops::Sub<Output = T>
+ core::ops::Mul<Output = T>
+ core::ops::Div<Output = T>
+ Copy,
{
let overlap = self.overlap(other);
let union = self.area() + other.area() - overlap;
overlap / union
}
pub fn contains(&self, point: Point<T>) -> bool
where
T: std::cmp::PartialOrd + core::ops::Add<Output = T> + Copy,
{
point.x >= self.x
&& point.x <= self.x + self.width
&& point.y >= self.y
&& point.y <= self.y + self.height
}
pub fn contains_bbox(&self, other: Self) -> bool
where
T: std::cmp::PartialOrd + Copy,
T: core::ops::Add<Output = T>,
{
self.contains(other.top_left())
&& self.contains(other.top_right())
&& self.contains(other.bottom_left())
&& self.contains(other.bottom_right())
}
pub fn new_xywh(x: T, y: T, width: T, height: T) -> Self {
Self {
x,
y,
width,
height,
}
}
pub fn new_xyxy(x1: T, y1: T, x2: T, y2: T) -> Self
where
T: core::ops::Sub<Output = T> + Copy,
{
Self {
x: x1,
y: y1,
width: x2 - x1,
height: y2 - y1,
}
}
pub fn containing(box1: Self, box2: Self) -> Self
where
T: traits::min::Min + traits::max::Max + Copy,
T: core::ops::Sub<Output = T>,
T: core::ops::Add<Output = T>,
{
let x1 = box1.x.min(box2.x);
let y1 = box1.y.min(box2.y);
let x2 = box1.x2().max(box2.x2());
let y2 = box1.y2().max(box2.y2());
Self::new_xyxy(x1, y1, x2, y2)
}
}
impl<T: core::ops::Sub<Output = T> + Copy> core::ops::Sub<T> for BBox<T> {
    type Output = BBox<T>;

    /// Subtracts the scalar `rhs` from all four components.
    fn sub(self, rhs: T) -> Self::Output {
        let Self {
            x,
            y,
            width,
            height,
        } = self;
        BBox {
            x: x - rhs,
            y: y - rhs,
            width: width - rhs,
            height: height - rhs,
        }
    }
}
impl<T: core::ops::Add<Output = T> + Copy> core::ops::Add<T> for BBox<T> {
    type Output = BBox<T>;

    /// Adds the scalar `rhs` to all four components.
    fn add(self, rhs: T) -> Self::Output {
        let Self {
            x,
            y,
            width,
            height,
        } = self;
        BBox {
            x: x + rhs,
            y: y + rhs,
            width: width + rhs,
            height: height + rhs,
        }
    }
}
impl<T: core::ops::Mul<Output = T> + Copy> core::ops::Mul<T> for BBox<T> {
    type Output = BBox<T>;

    /// Multiplies all four components by the scalar `rhs` (coordinate
    /// rescaling, e.g. between image resolutions).
    fn mul(self, rhs: T) -> Self::Output {
        let Self {
            x,
            y,
            width,
            height,
        } = self;
        BBox {
            x: x * rhs,
            y: y * rhs,
            width: width * rhs,
            height: height * rhs,
        }
    }
}
impl<T: core::ops::Div<Output = T> + Copy> core::ops::Div<T> for BBox<T> {
    type Output = BBox<T>;

    /// Divides all four components by the scalar `rhs`.
    fn div(self, rhs: T) -> Self::Output {
        let Self {
            x,
            y,
            width,
            height,
        } = self;
        BBox {
            x: x / rhs,
            y: y / rhs,
            width: width / rhs,
            height: height / rhs,
        }
    }
}
impl<T> core::ops::Add<BBox<T>> for BBox<T>
where
    T: core::ops::Sub<Output = T>
        + core::ops::Add<Output = T>
        + traits::min::Min
        + traits::max::Max
        + Copy,
{
    type Output = BBox<T>;

    /// Returns the smallest box containing both operands (bounding union).
    fn add(self, rhs: BBox<T>) -> Self::Output {
        BBox::new_xyxy(
            self.x1().min(rhs.x1()),
            self.y1().min(rhs.y1()),
            self.x2().max(rhs.x2()),
            self.y2().max(rhs.y2()),
        )
    }
}
/// The `+` operator on two boxes yields their bounding union.
// Fixed: dropped a redundant identity `.cast()` on the expected value (it
// cast `BBox<usize>` to `BBox<usize>`).
#[test]
fn test_bbox_add() {
    let bbox1: BBox<usize> = BBox::new_xyxy(0, 0, 10, 10);
    let bbox2: BBox<usize> = BBox::new_xyxy(5, 5, 15, 15);
    let bbox3: BBox<usize> = bbox1 + bbox2;
    assert_eq!(bbox3, BBox::new_xyxy(0, 0, 15, 15));
}
/// A 2-D point in the same top-left-origin coordinate space as [`BBox`].
// serde is an *optional* dependency (see Cargo.toml `serde` feature), so the
// derives must be feature-gated or `--no-default-features` builds fail.
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Point<T = f32> {
    x: T,
    y: T,
}
impl<T> Point<T> {
    /// Constructs a point from its coordinates.
    pub const fn new(x: T, y: T) -> Self {
        Self { x, y }
    }
    /// Returns the horizontal coordinate.
    pub const fn x(&self) -> T
    where
        T: Copy,
    {
        self.x
    }
    /// Returns the vertical coordinate.
    pub const fn y(&self) -> T
    where
        T: Copy,
    {
        self.y
    }
    /// Casts both coordinates to another numeric type with `as`-style
    /// conversion (via [`num::cast::AsPrimitive`]).
    pub fn cast<T2>(&self) -> Point<T2>
    where
        T: num::cast::AsPrimitive<T2>,
        T2: Copy + 'static,
    {
        Point {
            x: self.x.as_(),
            y: self.y.as_(),
        }
    }
}
impl<T: core::ops::Sub<T, Output = T> + Copy> core::ops::Sub<Point<T>> for Point<T> {
type Output = Point<T>;
fn sub(self, rhs: Point<T>) -> Self::Output {
Point {
x: self.x - rhs.x,
y: self.y - rhs.y,
}
}
}
impl<T: core::ops::Add<T, Output = T> + Copy> core::ops::Add<Point<T>> for Point<T> {
type Output = Point<T>;
fn add(self, rhs: Point<T>) -> Self::Output {
Point {
x: self.x + rhs.x,
y: self.y + rhs.y,
}
}
}
impl<T: core::ops::Sub<Output = T> + Copy> Point<T> {
    /// Re-expresses this point relative to a new `origin`; both must be in
    /// the same coordinate scale.
    pub fn with_origin(&self, origin: Self) -> Self {
        Self {
            x: self.x - origin.x,
            y: self.y - origin.y,
        }
    }
}
impl<T: core::ops::Add<Output = T> + Copy> Point<T> {
    /// Moves this point by the offset `point`.
    pub fn translate(&self, point: Point<T>) -> Self {
        Self {
            x: self.x + point.x,
            y: self.y + point.y,
        }
    }
}
impl<I: num::Zero> BBox<I>
where
    I: num::cast::AsPrimitive<usize>,
{
    /// Allocates a `(height, width)` array of zeros matching this box's size.
    pub fn zeros_ndarray_2d<T: num::Zero + Copy>(&self) -> ndarray::Array2<T> {
        let dims = (self.height.as_(), self.width.as_());
        ndarray::Array2::<T>::zeros(dims)
    }

    /// Allocates a `(height, width, channels)` array of zeros.
    pub fn zeros_ndarray_3d<T: num::Zero + Copy>(&self, channels: usize) -> ndarray::Array3<T> {
        let dims = (self.height.as_(), self.width.as_(), channels);
        ndarray::Array3::<T>::zeros(dims)
    }

    /// Allocates a `(height, width)` array of ones.
    pub fn ones_ndarray_2d<T: num::One + Copy>(&self) -> ndarray::Array2<T> {
        let dims = (self.height.as_(), self.width.as_());
        ndarray::Array2::<T>::ones(dims)
    }
}
impl<T: num::Float> BBox<T> {
    /// Rounds every component to the nearest representable value.
    pub fn round(&self) -> Self {
        Self::new(
            self.x.round(),
            self.y.round(),
            self.width.round(),
            self.height.round(),
        )
    }
}
#[cfg(test)]
mod bbox_clamp_tests {
    use super::*;
    /// Clamping a larger box into a smaller region yields exactly the region.
    #[test]
    pub fn bbox_test_clamp_box() {
        let large_box = BBox::new(0, 0, 100, 100);
        let small_box = BBox::new(10, 10, 20, 20);
        let clamped = large_box.clamp_box(small_box);
        assert_eq!(clamped, small_box);
    }
    /// A box hanging off the top-left of the region is clipped at the
    /// region's boundary.
    #[test]
    pub fn bbox_test_clamp_box_offset() {
        let box_a = BBox::new(0, 0, 100, 100);
        let box_b = BBox::new(-10, -10, 20, 20);
        let clamped = box_b.clamp_box(box_a);
        let expected = BBox::new(0, 0, 10, 10);
        assert_eq!(expected, clamped);
    }
}
#[cfg(test)]
mod bbox_padding_tests {
    use super::*;
    /// Uniform padding moves the corner up-left and grows both dimensions by
    /// `2 * padding`.
    #[test]
    pub fn bbox_test_padding() {
        let bbox = BBox::new(0, 0, 10, 10);
        let padded = bbox.padding(2);
        assert_eq!(padded, BBox::new(-2, -2, 14, 14));
    }
    /// Vertical-only padding leaves `x`/`width` untouched.
    #[test]
    pub fn bbox_test_padding_height() {
        let bbox = BBox::new(0, 0, 10, 10);
        let padded = bbox.padding_height(2);
        assert_eq!(padded, BBox::new(0, -2, 10, 14));
    }
    /// Horizontal-only padding leaves `y`/`height` untouched.
    #[test]
    pub fn bbox_test_padding_width() {
        let bbox = BBox::new(0, 0, 10, 10);
        let padded = bbox.padding_width(2);
        assert_eq!(padded, BBox::new(-2, 0, 14, 10));
    }
    /// Padding followed by clamping to a region that exactly covers the
    /// padded extent returns the region itself.
    #[test]
    pub fn bbox_test_clamped_padding() {
        let bbox = BBox::new(0, 0, 10, 10);
        let padded = bbox.padding(2);
        let clamp = BBox::new(0, 0, 12, 12);
        let clamped = padded.clamp_box(clamp);
        assert_eq!(clamped, clamp);
    }
    /// Regression test: a padded float box clamped to the image must end up
    /// fully inside the clamp region.
    // Fixed: removed the unused local `og` (dead code warning).
    #[test]
    pub fn bbox_clamp_failure() {
        let padded = BBox {
            x: 471.3,
            y: 51.412499999999994,
            width: 40.69999999999999,
            height: 338.54999999999995,
        };
        let clamp = BBox::new(0.0, 0.0, 512.0, 512.0);
        let sus = padded.clamp_box(clamp);
        assert!(clamp.contains_bbox(sus));
    }
}
#[cfg(test)]
mod bbox_scale_tests {
    use super::*;
    /// Integer scaling keeps the center fixed: doubling a 10x10 box at the
    /// origin moves the corner to (-5, -5).
    #[test]
    pub fn bbox_test_scale_int() {
        let bbox = BBox::new(0, 0, 10, 10);
        let scaled = bbox.scale(2);
        assert_eq!(scaled, BBox::new(-5, -5, 20, 20));
    }
    /// A 5% enlargement distributes half the growth to each side.
    #[test]
    pub fn bbox_test_scale_float() {
        let bbox = BBox::new(0, 0, 10, 10).cast();
        let scaled = bbox.scale(1.05); // 5% increase
        let l = 10.0 * 0.05;
        assert_eq!(scaled, BBox::new(-l / 2.0, -l / 2.0, 10.0 + l, 10.0 + l));
    }
    /// Shrinking is symmetric to enlarging (each side moves inwards).
    #[test]
    pub fn bbox_test_scale_float_negative() {
        let bbox = BBox::new(0, 0, 10, 10).cast();
        let scaled = bbox.scale(0.95); // 5% decrease
        let l = -10.0 * 0.05;
        assert_eq!(scaled, BBox::new(-l / 2.0, -l / 2.0, 10.0 + l, 10.0 + l));
    }
    /// Scaling in f64, casting back to i32 and clamping truncates the
    /// negative corner offset to the image origin.
    #[test]
    pub fn bbox_scale_float() {
        let bbox = BBox::new_xywh(0, 0, 200, 200);
        let scaled = bbox.cast::<f64>().scale(1.1).cast::<i32>().clamp(0, 1000);
        let expected = BBox::new(0, 0, 220, 220);
        assert_eq!(scaled, expected);
    }
    /// Replacement for the legacy `add_padding_bbox` helper: scale by 20%
    /// around the center, then clamp to the image bounds.
    #[test]
    pub fn add_padding_bbox_example() {
        let bbox = BBox::new(100, 200, 300, 400);
        let scaled = bbox.cast::<f64>().scale(1.2).cast::<i32>().clamp(0, 1000);
        assert_eq!(bbox, BBox::new(100, 200, 300, 400));
        assert_eq!(scaled, BBox::new(70, 160, 360, 480));
    }
    /// Replacement for the legacy `scale_bboxes` helper: rescaling between
    /// image resolutions multiplies every coordinate.
    // Fixed: this used the center-preserving `bbox.scale(2)` (which yields
    // x = 100 - 300/2 = -50) while asserting the coordinate-doubled box; the
    // intended operation is scalar multiplication.
    #[test]
    pub fn scale_bboxes() {
        let bbox = BBox::new(100, 200, 300, 400);
        let scaled = bbox * 2;
        assert_eq!(scaled, BBox::new(200, 400, 600, 800));
    }
}

2
bbox/src/traits.rs Normal file
View File

@@ -0,0 +1,2 @@
/// [`Max`](max::Max): maximum of two values, implemented for floats as well
/// as `Ord` integers.
pub mod max;
/// [`Min`](min::Min): minimum of two values, implemented for floats as well
/// as `Ord` integers.
pub mod min;

27
bbox/src/traits/max.rs Normal file
View File

@@ -0,0 +1,27 @@
/// Returns the larger of two values.
///
/// Unlike [`Ord::max`], this trait is also implemented for the floating point
/// primitives, which are only `PartialOrd`.
pub trait Max: Sized + Copy {
    /// Returns the maximum of `self` and `other`.
    fn max(self, other: Self) -> Self;
}

/// Implements [`Max`] by delegating to the given two-argument function.
macro_rules! delegate_max {
    ($via:path => $($ty:ty),+ $(,)?) => {
        $(
            impl Max for $ty {
                fn max(self, other: Self) -> Self {
                    $via(self, other)
                }
            }
        )+
    };
}

delegate_max!(Ord::max => usize, u8, u16, u32, u64, u128, isize, i8, i16, i32, i64, i128);
// Floats use the inherent `max`, matching the original `Self::max` delegation.
delegate_max!(f32::max => f32);
delegate_max!(f64::max => f64);

27
bbox/src/traits/min.rs Normal file
View File

@@ -0,0 +1,27 @@
/// Returns the smaller of two values.
///
/// Unlike [`Ord::min`], this trait is also implemented for the floating point
/// primitives, which are only `PartialOrd`.
pub trait Min: Sized + Copy {
    /// Returns the minimum of `self` and `other`.
    fn min(self, other: Self) -> Self;
}

/// Implements [`Min`] by delegating to the given two-argument function.
macro_rules! delegate_min {
    ($via:path => $($ty:ty),+ $(,)?) => {
        $(
            impl Min for $ty {
                fn min(self, other: Self) -> Self {
                    $via(self, other)
                }
            }
        )+
    };
}

delegate_min!(Ord::min => usize, u8, u16, u32, u64, u128, isize, i8, i16, i32, i64, i128);
// Floats use the inherent `min`, matching the original `Self::min` delegation.
delegate_min!(f32::min => f32);
delegate_min!(f64::min => f64);

View File

@@ -163,6 +163,21 @@ impl<T: Num, const D: usize> AxisAlignedBoundingBox<T, D> {
}
}
/// Enlarges / shrinks the box by `scalar` while keeping its center fixed:
/// the size is multiplied by `scalar` and the corner point is moved back by
/// half of the size increase.
pub fn scale_uniform(self, scalar: T) -> Self
where
    T: core::ops::MulAssign,
    T: core::ops::DivAssign,
    T: core::ops::SubAssign,
{
    // NOTE(review): the *Assign bounds appear to be required by the
    // component-wise vector ops used below — confirm against the `Num` alias.
    let two = T::one() + T::one();
    let new_size = self.size * scalar;
    let new_point = self.point.coords - (new_size - self.size) / two;
    Self {
        point: Point::from(new_point),
        size: new_size,
    }
}
pub fn contains_bbox(&self, other: &Self) -> bool
where
T: core::ops::AddAssign,
@@ -270,15 +285,17 @@ impl<T: Num, const D: usize> AxisAlignedBoundingBox<T, D> {
})
}
// pub fn as_<T2>(&self) -> Option<Aabb<T2, D>>
// where
// T2: Num + simba::scalar::SubsetOf<T>,
// {
// Some(Aabb {
// point: Point::from(self.point.coords.as_()),
// size: self.size.as_(),
// })
// }
/// Casts the box to another scalar type with `as`-style conversion applied
/// component-wise to the corner point and the size.
pub fn as_<T2>(&self) -> Aabb<T2, D>
where
    T2: Num,
    T: num::cast::AsPrimitive<T2>,
{
    Aabb {
        point: Point::from(self.point.coords.map(|x| x.as_())),
        size: self.size.map(|x| x.as_()),
    }
}
pub fn measure(&self) -> T
where
T: core::ops::MulAssign,

62
cr2.xmp
View File

@@ -1,62 +0,0 @@
<?xpacket begin='' id='W5M0MpCehiHzreSzNTczkc9d'?><x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><rdf:Description rdf:about="" xmlns:xmp="http://ns.adobe.com/xap/1.0/"><xmp:Rating>0</xmp:Rating></rdf:Description></rdf:RDF></x:xmpmeta>
<?xpacket end='w'?>

View File

@@ -1,9 +0,0 @@
.load /Users/fs0c131y/.cache/cargo/target/release/libsqlite3_safetensor_cosine.dylib
SELECT
cosine_similarity(e1.embedding, e2.embedding) AS similarity
FROM
embeddings AS e1
CROSS JOIN embeddings AS e2
WHERE
e1.id = e2.id;

View File

@@ -43,6 +43,8 @@
system: let
pkgs = import nixpkgs {
inherit system;
config.allowUnfree = true;
config.cudaSupport = pkgs.stdenv.isLinux;
overlays = [
rust-overlay.overlays.default
(final: prev: {
@@ -75,7 +77,7 @@
craneLib = (crane.mkLib pkgs).overrideToolchain stableToolchain;
craneLibLLvmTools = (crane.mkLib pkgs).overrideToolchain stableToolchainWithLLvmTools;
ort_static = pkgs.onnxruntime.overrideAttrs (old: {
ort_static = (pkgs.onnxruntime.override {cudaSupport = true;}).overrideAttrs (old: {
cmakeFlags =
old.cmakeFlags
++ [
@@ -198,8 +200,9 @@
devShells = {
default = pkgs.mkShell.override {stdenv = pkgs.clangStdenv;} (
commonArgs
// {
// rec {
LLDB_DEBUGSERVER_PATH = "/Applications/Xcode.app/Contents/SharedFrameworks/LLDB.framework/Versions/A/Resources/debugserver";
LD_LIBRARY_PATH = "$LD_LIBRARY_PATH:${builtins.toString (pkgs.lib.makeLibraryPath packages)}";
packages = with pkgs;
[
stableToolchainWithRustAnalyzer
@@ -211,9 +214,41 @@
mnn
cargo-make
hyperfine
opencv
uv
# (python312.withPackages (ps:
# with ps; [
# numpy
# matplotlib
# scikit-learn
# opencv-python
# seaborn
# torch
# torchvision
# tensorflow-lite
# retinaface
# facenet-pytorch
# tqdm
# pillow
# orjson
# huggingface-hub
# # insightface
# ]))
]
++ (lib.optionals pkgs.stdenv.isDarwin [
apple-sdk_13
])
++ (lib.optionals pkgs.stdenv.isLinux [
xorg.libX11
xorg.libXcursor
xorg.libXrandr
xorg.libXi
xorg.libxcb
libxkbcommon
vulkan-loader
wayland
zenity
cudatoolkit
]);
}
);

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

36
ndcv-bridge/Cargo.toml Normal file
View File

@@ -0,0 +1,36 @@
[package]
name = "ndcv-bridge"
version.workspace = true
edition.workspace = true
[dependencies]
bounding-box.workspace = true
nalgebra.workspace = true
bytemuck.workspace = true
error-stack.workspace = true
fast_image_resize.workspace = true
ndarray = { workspace = true, features = ["rayon"] }
num.workspace = true
opencv = { workspace = true, optional = true }
rayon = "1.10.0"
thiserror.workspace = true
tracing = "0.1.41"
wide = "0.7.32"
img-parts.workspace = true
[dev-dependencies]
divan.workspace = true
ndarray-npy.workspace = true
[features]
opencv = ["dep:opencv"]
default = ["opencv"]
[[bench]]
name = "conversions"
harness = false
[[bench]]
name = "gaussian"
harness = false

View File

@@ -0,0 +1,75 @@
use divan::black_box;
use ndcv_bridge::*;
// #[global_allocator]
// static ALLOC: AllocProfiler = AllocProfiler::system();
fn main() {
    divan::main();
}
// Owned Mat -> owned ndarray conversions at increasing square image sizes.
#[divan::bench]
fn bench_3d_mat_to_ndarray_512() {
    bench_mat_to_3d_ndarray(512);
}
#[divan::bench]
fn bench_3d_mat_to_ndarray_1024() {
    bench_mat_to_3d_ndarray(1024);
}
#[divan::bench]
fn bench_3d_mat_to_ndarray_2k() {
    bench_mat_to_3d_ndarray(2048);
}
#[divan::bench]
fn bench_3d_mat_to_ndarray_4k() {
    bench_mat_to_3d_ndarray(4096);
}
#[divan::bench]
fn bench_3d_mat_to_ndarray_8k() {
    bench_mat_to_3d_ndarray(8192);
}
// Borrowed (view-only) conversion for comparison against the owned path.
#[divan::bench]
fn bench_3d_mat_to_ndarray_8k_ref() {
    bench_mat_to_3d_ndarray_ref(8192);
}
// Fixed: dropped the misleading `_ref` suffix — the 2-D helper returns an
// owned `Array2`, not a borrowed view.
#[divan::bench]
fn bench_2d_mat_to_ndarray_8k() {
    bench_mat_to_2d_ndarray(8192);
}
/// Fills a `size` x `size` single-channel Mat and converts it to an owned
/// `Array2<u8>` (conversion plus element copy via `to_owned`).
fn bench_mat_to_2d_ndarray(size: i32) -> ndarray::Array2<u8> {
    let mat =
        opencv::core::Mat::new_nd_with_default(&[size, size], opencv::core::CV_8UC1, (200).into())
            .expect("failed");
    let ndarray: ndarray::Array2<u8> = mat.as_ndarray().expect("failed").to_owned();
    ndarray
}
/// Fills a `size` x `size` 3-channel Mat and converts it to an owned
/// `Array3<u8>`.
fn bench_mat_to_3d_ndarray(size: i32) -> ndarray::Array3<u8> {
    let mat = opencv::core::Mat::new_nd_with_default(
        &[size, size],
        opencv::core::CV_8UC3,
        (200, 100, 10).into(),
    )
    .expect("failed");
    // ndarray::Array3::<u8>::from_mat(black_box(mat)).expect("failed")
    let ndarray: ndarray::Array3<u8> = mat.as_ndarray().expect("failed").to_owned();
    ndarray
}
/// Converts a Mat to a borrowed `ArrayView3` only — no element copy.
fn bench_mat_to_3d_ndarray_ref(size: i32) {
    let mut mat = opencv::core::Mat::new_nd_with_default(
        &[size, size],
        opencv::core::CV_8UC3,
        (200, 100, 10).into(),
    )
    .expect("failed");
    let array: ndarray::ArrayView3<u8> = black_box(&mut mat).as_ndarray().expect("failed");
    let _ = black_box(array);
}

View File

@@ -0,0 +1,265 @@
use divan::black_box;
use ndarray::*;
use ndcv_bridge::*;
// #[global_allocator]
// static ALLOC: AllocProfiler = AllocProfiler::system();
fn main() {
    divan::main();
}
// Helper function to create test images with different patterns
/// Builds a `size` x `size` RGB (3-channel) test image. `pattern` selects
/// "edges", "gradient" or "checkerboard"; anything else yields solid white.
fn create_test_image(size: usize, pattern: &str) -> Array3<u8> {
    let mut arr = Array3::<u8>::zeros((size, size, 3));
    match pattern {
        "edges" => {
            // Create a pattern with sharp edges
            arr.slice_mut(s![size / 4..3 * size / 4, size / 4..3 * size / 4, ..])
                .fill(255);
        }
        "gradient" => {
            // Create a gradual gradient
            for i in 0..size {
                let val = (i * 255 / size) as u8;
                arr.slice_mut(s![i, .., ..]).fill(val);
            }
        }
        "checkerboard" => {
            // Create a checkerboard pattern (20px squares)
            for i in 0..size {
                for j in 0..size {
                    if (i / 20 + j / 20) % 2 == 0 {
                        arr[[i, j, 0]] = 255;
                        arr[[i, j, 1]] = 255;
                        arr[[i, j, 2]] = 255;
                    }
                }
            }
        }
        _ => arr.fill(255), // Default to solid white
    }
    arr
}
// Gaussian blur across square image sizes, comparing u8 vs f32 element types
// and the allocating vs in-place code paths.
#[divan::bench_group]
mod sizes {
    use super::*;
    // Benchmark different image sizes
    #[divan::bench(args = [512, 1024, 2048, 4096])]
    fn bench_gaussian_sizes_u8(size: usize) {
        let arr = Array3::<u8>::ones((size, size, 3));
        let _out = black_box(
            arr.gaussian_blur((3, 3), 1.0, 1.0, BorderType::BorderConstant)
                .unwrap(),
        );
    }
    #[divan::bench(args = [512, 1024, 2048, 4096])]
    fn bench_gaussian_sizes_u8_inplace(size: usize) {
        let mut arr = Array3::<u8>::ones((size, size, 3));
        black_box(
            arr.gaussian_blur_inplace((3, 3), 1.0, 1.0, BorderType::BorderConstant)
                .unwrap(),
        );
    }
    #[divan::bench(args = [512, 1024, 2048, 4096])]
    fn bench_gaussian_sizes_f32(size: usize) {
        let arr = Array3::<f32>::ones((size, size, 3));
        let _out = black_box(
            arr.gaussian_blur((3, 3), 1.0, 1.0, BorderType::BorderConstant)
                .unwrap(),
        );
    }
    #[divan::bench(args = [512, 1024, 2048, 4096])]
    fn bench_gaussian_sizes_f32_inplace(size: usize) {
        let mut arr = Array3::<f32>::ones((size, size, 3));
        black_box(
            arr.gaussian_blur_inplace((3, 3), 1.0, 1.0, BorderType::BorderConstant)
                .unwrap(),
        );
    }
}
// Benchmark different kernel sizes
#[divan::bench(args = [(3, 3), (5, 5), (7, 7), (9, 9), (11, 11)])]
fn bench_gaussian_kernels(kernel_size: (u8, u8)) {
    let mut arr = Array3::<u8>::ones((1000, 1000, 3));
    arr.gaussian_blur_inplace(kernel_size, 1.0, 1.0, BorderType::BorderConstant)
        .unwrap();
}
// Benchmark different sigma values
#[divan::bench(args = [0.5, 1.0, 2.0, 5.0])]
fn bench_gaussian_sigmas(sigma: f64) {
    let mut arr = Array3::<u8>::ones((1000, 1000, 3));
    arr.gaussian_blur_inplace((3, 3), sigma, sigma, BorderType::BorderConstant)
        .unwrap();
}
// Benchmark different sigma_x and sigma_y combinations
#[divan::bench(args = [(0.5, 2.0), (1.0, 1.0), (2.0, 0.5), (3.0, 1.0)])]
fn bench_gaussian_asymmetric_sigmas(sigmas: (f64, f64)) {
    let mut arr = Array3::<u8>::ones((1000, 1000, 3));
    arr.gaussian_blur_inplace((3, 3), sigmas.0, sigmas.1, BorderType::BorderConstant)
        .unwrap();
}
// Benchmark different border types
/// Runs one blur per border-handling strategy on the same buffer; the
/// collected `Vec<()>` only exists to force evaluation of every iteration.
#[divan::bench]
fn bench_gaussian_border_types() -> Vec<()> {
    let border_types = [
        BorderType::BorderConstant,
        BorderType::BorderReplicate,
        BorderType::BorderReflect,
        BorderType::BorderReflect101,
    ];
    let mut arr = Array3::<u8>::ones((1000, 1000, 3));
    border_types
        .iter()
        .map(|border_type| {
            arr.gaussian_blur_inplace((3, 3), 1.0, 1.0, *border_type)
                .unwrap();
        })
        .collect()
}
// Benchmark different image patterns
/// Blur cost over image *content* (timing includes test-image construction).
#[divan::bench]
fn bench_gaussian_patterns() {
    let patterns = ["edges", "gradient", "checkerboard", "solid"];
    patterns.iter().for_each(|&pattern| {
        let mut arr = create_test_image(1000, pattern);
        arr.gaussian_blur_inplace((3, 3), 1.0, 1.0, BorderType::BorderConstant)
            .unwrap();
    })
}
// Isolates the cost of the output allocation: same 4K blur, in-place vs
// freshly allocated output.
#[divan::bench_group]
mod allocation {
    use super::*;
    #[divan::bench]
    fn bench_gaussian_allocation_inplace() {
        let mut arr = Array3::<f32>::ones((3840, 2160, 3));
        black_box(
            arr.gaussian_blur_inplace((3, 3), 1.0, 1.0, BorderType::BorderConstant)
                .unwrap(),
        );
    }
    #[divan::bench]
    fn bench_gaussian_allocation_allocate() {
        let arr = Array3::<f32>::ones((3840, 2160, 3));
        let _out = black_box(
            arr.gaussian_blur((3, 3), 1.0, 1.0, BorderType::BorderConstant)
                .unwrap(),
        );
    }
}
/// Realistic resolution / kernel combinations, comparing the allocating and
/// in-place blur paths for u8 and f32 images.
#[divan::bench_group]
mod realistic {
    use super::*;
    #[divan::bench]
    fn small_800_600_3x3() {
        let small_blur = Array3::<u8>::ones((800, 600, 3));
        let _blurred = black_box(
            small_blur
                .gaussian_blur((3, 3), 0.5, 0.5, BorderType::BorderConstant)
                .unwrap(),
        );
    }
    #[divan::bench]
    fn small_800_600_3x3_inplace() {
        let mut small_blur = Array3::<u8>::ones((800, 600, 3));
        small_blur
            .gaussian_blur_inplace((3, 3), 0.5, 0.5, BorderType::BorderConstant)
            .unwrap();
    }
    // Fixed: this "allocating" bench previously called `gaussian_blur_inplace`,
    // duplicating `medium_1920x1080_5x5_inplace` instead of measuring the
    // allocating path like its small/large siblings.
    #[divan::bench]
    fn medium_1920x1080_5x5() {
        let medium_blur = Array3::<u8>::ones((1920, 1080, 3));
        let _blurred = black_box(
            medium_blur
                .gaussian_blur((5, 5), 2.0, 2.0, BorderType::BorderConstant)
                .unwrap(),
        );
    }
    #[divan::bench]
    fn medium_1920x1080_5x5_inplace() {
        let mut medium_blur = Array3::<u8>::ones((1920, 1080, 3));
        medium_blur
            .gaussian_blur_inplace((5, 5), 2.0, 2.0, BorderType::BorderConstant)
            .unwrap();
    }
    #[divan::bench]
    fn large_3840x2160_9x9() {
        let large_blur = Array3::<u8>::ones((3840, 2160, 3));
        let _blurred = black_box(
            large_blur
                .gaussian_blur((9, 9), 5.0, 5.0, BorderType::BorderConstant)
                .unwrap(),
        );
    }
    #[divan::bench]
    fn large_3840x2160_9x9_inplace() {
        let mut large_blur = Array3::<u8>::ones((3840, 2160, 3));
        large_blur
            .gaussian_blur_inplace((9, 9), 5.0, 5.0, BorderType::BorderConstant)
            .unwrap();
    }
    #[divan::bench]
    fn small_800_600_3x3_f32() {
        let small_blur = Array3::<f32>::ones((800, 600, 3));
        let _blurred = black_box(
            small_blur
                .gaussian_blur((3, 3), 0.5, 0.5, BorderType::BorderConstant)
                .unwrap(),
        );
    }
    #[divan::bench]
    fn small_800_600_3x3_inplace_f32() {
        let mut small_blur = Array3::<f32>::ones((800, 600, 3));
        small_blur
            .gaussian_blur_inplace((3, 3), 0.5, 0.5, BorderType::BorderConstant)
            .unwrap();
    }
    // Fixed: same mislabel as `medium_1920x1080_5x5` — now measures the
    // allocating path.
    #[divan::bench]
    fn medium_1920x1080_5x5_f32() {
        let medium_blur = Array3::<f32>::ones((1920, 1080, 3));
        let _blurred = black_box(
            medium_blur
                .gaussian_blur((5, 5), 2.0, 2.0, BorderType::BorderConstant)
                .unwrap(),
        );
    }
    #[divan::bench]
    fn medium_1920x1080_5x5_inplace_f32() {
        let mut medium_blur = Array3::<f32>::ones((1920, 1080, 3));
        medium_blur
            .gaussian_blur_inplace((5, 5), 2.0, 2.0, BorderType::BorderConstant)
            .unwrap();
    }
    #[divan::bench]
    fn large_3840x2160_9x9_f32() {
        let large_blur = Array3::<f32>::ones((3840, 2160, 3));
        let _blurred = black_box(
            large_blur
                .gaussian_blur((9, 9), 5.0, 5.0, BorderType::BorderConstant)
                .unwrap(),
        );
    }
    #[divan::bench]
    fn large_3840x2160_9x9_inplace_f32() {
        let mut large_blur = Array3::<f32>::ones((3840, 2160, 3));
        large_blur
            .gaussian_blur_inplace((9, 9), 5.0, 5.0, BorderType::BorderConstant)
            .unwrap();
    }
}

180
ndcv-bridge/src/blend.rs Normal file
View File

@@ -0,0 +1,180 @@
use crate::prelude_::*;
use ndarray::*;
// Module-local result type carrying an error-stack report.
type Result<T, E = Report<NdCvError>> = std::result::Result<T, E>;
mod seal {
    // Restricts `NdBlend` implementations to ndarray array types in this crate.
    pub trait Sealed {}
    impl<T: ndarray::Data<Elem = f32>> Sealed for ndarray::ArrayBase<T, ndarray::Ix3> {}
}
/// Alpha-blending of two images weighted by a per-pixel mask.
pub trait NdBlend<T, D: ndarray::Dimension>: seal::Sealed {
    /// Blends `other` over `self` weighted per pixel by `mask * alpha`,
    /// returning a newly allocated array.
    fn blend(
        &self,
        mask: ndarray::ArrayView<T, D::Smaller>,
        other: ndarray::ArrayView<T, D>,
        alpha: T,
    ) -> Result<ndarray::Array<T, D>>;
    /// Like [`NdBlend::blend`], but writes the result back into `self`.
    fn blend_inplace(
        &mut self,
        mask: ndarray::ArrayView<T, D::Smaller>,
        other: ndarray::ArrayView<T, D>,
        alpha: T,
    ) -> Result<()>;
}
impl<S> NdBlend<f32, Ix3> for ndarray::ArrayBase<S, Ix3>
where
    S: ndarray::DataMut<Elem = f32>,
{
    /// Computes `self * (1 - mask * alpha) + other * mask * alpha` per pixel
    /// into a new array.
    ///
    /// NOTE(review): the SIMD lane is `f32x4`, so this appears to assume at
    /// most 4 channels per pixel — confirm behaviour for other channel counts.
    fn blend(
        &self,
        mask: ndarray::ArrayView<f32, Ix2>,
        other: ndarray::ArrayView<f32, Ix3>,
        alpha: f32,
    ) -> Result<ndarray::Array<f32, Ix3>> {
        // Fixed: "imagge" typo in the user-facing error message.
        if self.shape() != other.shape() {
            return Err(NdCvError)
                .attach_printable("Shapes of image and other image do not match");
        }
        if self.shape()[0] != mask.shape()[0] || self.shape()[1] != mask.shape()[1] {
            return Err(NdCvError).attach_printable("Shapes of image and mask do not match");
        }
        let mut output = ndarray::Array3::zeros(self.dim());
        let (_height, _width, channels) = self.dim();
        // Blend one pixel (channel lane along Axis(2)) at a time, in parallel.
        Zip::from(output.lanes_mut(Axis(2)))
            .and(self.lanes(Axis(2)))
            .and(other.lanes(Axis(2)))
            .and(mask)
            .par_for_each(|mut out, this, other, mask| {
                let this = wide::f32x4::from(this.as_slice().expect("Invalid self array"));
                let other = wide::f32x4::from(other.as_slice().expect("Invalid other array"));
                let mask = wide::f32x4::splat(mask * alpha);
                let o = this * (1.0 - mask) + other * mask;
                out.as_slice_mut()
                    .expect("Failed to get mutable slice")
                    .copy_from_slice(&o.as_array_ref()[..channels]);
            });
        Ok(output)
    }

    /// In-place variant of [`NdBlend::blend`]. All three arrays must be
    /// contiguous (standard layout) so they can be viewed as flat slices.
    fn blend_inplace(
        &mut self,
        mask: ndarray::ArrayView<f32, <Ix3 as Dimension>::Smaller>,
        other: ndarray::ArrayView<f32, Ix3>,
        alpha: f32,
    ) -> Result<()> {
        // Fixed: "imagge" typo in the user-facing error message.
        if self.shape() != other.shape() {
            return Err(NdCvError)
                .attach_printable("Shapes of image and other image do not match");
        }
        if self.shape()[0] != mask.shape()[0] || self.shape()[1] != mask.shape()[1] {
            return Err(NdCvError).attach_printable("Shapes of image and mask do not match");
        }
        let (_height, _width, channels) = self.dim();
        let this = self
            .as_slice_mut()
            .ok_or(NdCvError)
            .attach_printable("Failed to get source image as a continuous slice")?;
        let other = other
            .as_slice()
            .ok_or(NdCvError)
            .attach_printable("Failed to get other image as a continuous slice")?;
        let mask = mask
            .as_slice()
            .ok_or(NdCvError)
            .attach_printable("Failed to get mask as a continuous slice")?;
        use rayon::prelude::*;
        // One mask value per pixel; each pixel is a `channels`-long chunk.
        this.par_chunks_exact_mut(channels)
            .zip(other.par_chunks_exact(channels))
            .zip(mask)
            .for_each(|((this, other), mask)| {
                let this_wide = wide::f32x4::from(&*this);
                let other = wide::f32x4::from(other);
                let mask = wide::f32x4::splat(mask * alpha);
                this.copy_from_slice(
                    &(this_wide * (1.0 - mask) + other * mask).as_array_ref()[..channels],
                );
            });
        Ok(())
    }
}
/// Blends a striped RGB image with its transpose using a right-half mask and
/// checks the exact per-pixel result.
#[test]
pub fn test_blend() {
    // Horizontal colour bands: rows 0-3 red, 4-6 green, 7-9 blue.
    let img = Array3::<f32>::from_shape_fn((10, 10, 3), |(i, j, k)| match (i, j, k) {
        (0..=3, _, 0) => 1f32, // red
        (4..=6, _, 1) => 1f32, // green
        (7..=9, _, 2) => 1f32, // blue
        _ => 0f32,
    });
    // The other image is the transpose: vertical colour bands.
    let other = img.clone().permuted_axes([1, 0, 2]).to_owned();
    // Mask selects columns > 5, i.e. the right side takes `other`.
    let mask = Array2::<f32>::from_shape_fn((10, 10), |(_, j)| if j > 5 { 1f32 } else { 0f32 });
    // let other = Array3::<f32>::zeros((10, 10, 3));
    let out = img.blend(mask.view(), other.view(), 1f32).unwrap();
    let out_u8 = out.mapv(|v| (v * 255f32) as u8);
    let expected = Array3::<u8>::from_shape_fn((10, 10, 3), |(i, j, k)| {
        match (i, j, k) {
            (0..=3, 0..=5, 0) => u8::MAX,                  // red
            (4..=6, 0..=5, 1) | (_, 6, 1) => u8::MAX,      // green
            (7..=9, 0..=5, 2) | (_, 7..=10, 2) => u8::MAX, // blue
            _ => u8::MIN,
        }
    });
    assert_eq!(out_u8, expected);
}
// #[test]
// pub fn test_blend_inplace() {
// let mut img = Array3::<f32>::from_shape_fn((10, 10, 3), |(i, j, k)| match (i, j, k) {
// (0..=3, _, 0) => 1f32, // red
// (4..=6, _, 1) => 1f32, // green
// (7..=9, _, 2) => 1f32, // blue
// _ => 0f32,
// });
// let other = img.clone().permuted_axes([1, 0, 2]);
// let mask = Array2::<f32>::from_shape_fn((10, 10), |(_, j)| if j > 5 { 1f32 } else { 0f32 });
// // let other = Array3::<f32>::zeros((10, 10, 3));
// img.blend_inplace(mask.view(), other.view(), 1f32).unwrap();
// let out_u8 = img.mapv(|v| (v * 255f32) as u8);
// let expected = Array3::<u8>::from_shape_fn((10, 10, 3), |(i, j, k)| {
// match (i, j, k) {
// (0..=3, 0..=5, 0) => u8::MAX, // red
// (4..=6, 0..=5, 1) | (_, 6, 1) => u8::MAX, // green
// (7..=9, 0..=5, 2) | (_, 7..=10, 2) => u8::MAX, // blue
// _ => u8::MIN,
// }
// });
// assert_eq!(out_u8, expected);
// }

View File

@@ -0,0 +1,48 @@
//! Calculates the up-right bounding rectangle of a point set or non-zero pixels of gray-scale image.
//! The function calculates and returns the minimal up-right bounding rectangle for the specified point set or non-zero pixels of gray-scale image.
use crate::{NdAsImage, prelude_::*};
/// Sealed extension trait computing the minimal up-right (axis-aligned)
/// bounding rectangle of the non-zero pixels of a 2-D array
/// (wraps `opencv::imgproc::bounding_rect`).
pub trait BoundingRect: seal::SealedInternal {
    /// Returns the bounding rectangle of all non-zero elements as an `Aabb2`.
    fn bounding_rect(&self) -> Result<bounding_box::Aabb2<i32>, NdCvError>;
}
// Private seal: `BoundingRect` can only be implemented inside this crate,
// and only for 2-D ndarray array types.
mod seal {
    pub trait SealedInternal {}
    impl<T, S: ndarray::Data<Elem = T>> SealedInternal for ndarray::ArrayBase<S, ndarray::Ix2> {}
}
impl<S: ndarray::Data<Elem = u8>> BoundingRect for ndarray::ArrayBase<S, ndarray::Ix2> {
    /// Views the array as an OpenCV `Mat` and delegates to
    /// `opencv::imgproc::bounding_rect`, translating the result into an `Aabb2`.
    fn bounding_rect(&self) -> Result<bounding_box::Aabb2<i32>, NdCvError> {
        let image = self.as_image_mat()?;
        let cv_rect = opencv::imgproc::bounding_rect(image.as_ref()).change_context(NdCvError)?;
        let (x, y) = (cv_rect.x, cv_rect.y);
        let (width, height) = (cv_rect.width, cv_rect.height);
        Ok(bounding_box::Aabb2::from_xywh(x, y, width, height))
    }
}
#[test]
fn test_bounding_rect_empty() {
    // An all-zero image has no non-zero pixels, so the rectangle is degenerate.
    let image = ndarray::Array2::<u8>::zeros((10, 10));
    let bbox = image.bounding_rect().unwrap();
    assert_eq!(bbox, bounding_box::Aabb2::from_xywh(0, 0, 0, 0));
}
#[test]
fn test_bounding_rect_valued() {
    // Fill a single 3x3 patch at (1, 1); the bounding box must match it exactly.
    let mut image = ndarray::Array2::<u8>::zeros((10, 10));
    let patch = bounding_box::Aabb2::from_xywh(1, 1, 3, 3);
    crate::NdRoiMut::roi_mut(&mut image, patch).fill(1);
    let bbox = image.bounding_rect().unwrap();
    assert_eq!(bbox, bounding_box::Aabb2::from_xywh(1, 1, 3, 3));
}
#[test]
fn test_bounding_rect_complex() {
    // Two overlapping filled patches; the result is the union's bounding box.
    let mut image = ndarray::Array2::<u8>::zeros((10, 10));
    crate::NdRoiMut::roi_mut(&mut image, bounding_box::Aabb2::from_xywh(1, 3, 3, 3)).fill(1);
    crate::NdRoiMut::roi_mut(&mut image, bounding_box::Aabb2::from_xywh(2, 3, 3, 5)).fill(5);
    let bbox = image.bounding_rect().unwrap();
    // x spans 1..5 (width 4), y spans 3..8 (height 5).
    assert_eq!(bbox, bounding_box::Aabb2::from_xywh(1, 3, 4, 5));
}

4
ndcv-bridge/src/codec.rs Normal file
View File

@@ -0,0 +1,4 @@
pub mod codecs; // Concrete OpenCV-backed `CvEncoder`/`CvDecoder` implementations.
pub mod decode; // `Decoder`/`Decodable` traits: bytes -> image.
pub mod encode; // `Encoder`/`Encodable` traits: image -> bytes.
pub mod error; // `ErrorReason` attachments for codec I/O failures.

View File

@@ -0,0 +1,218 @@
use super::decode::Decoder;
use super::encode::Encoder;
use crate::NdCvError;
use crate::conversions::matref::MatRef;
use error_stack::*;
use img_parts::{
Bytes,
jpeg::{Jpeg, markers},
};
use opencv::{
core::{Mat, Vector, VectorToVec},
imgcodecs::{ImreadModes, ImwriteFlags, imdecode, imencode},
};
/// Image encoder selection together with its format-specific parameters.
#[derive(Debug)]
pub enum CvEncoder {
    Jpeg(CvJpegEncFlags),
    Tiff(CvTiffEncFlags),
}
/// Flag-free format tag used to branch on the output format after encoding.
pub enum EncKind {
    Jpeg,
    Tiff,
}
impl CvEncoder {
    /// Collapses the encoder to its flag-free format tag.
    fn kind(&self) -> EncKind {
        if matches!(self, Self::Jpeg(_)) {
            EncKind::Jpeg
        } else {
            EncKind::Tiff
        }
    }
    /// File extension OpenCV uses to select the codec in `imencode`.
    fn extension(&self) -> &'static str {
        match self {
            Self::Jpeg(_) => ".jpg",
            Self::Tiff(_) => ".tiff",
        }
    }
    /// Flattens the format-specific flags into OpenCV's parameter list.
    fn to_cv_param_list(&self) -> Vector<i32> {
        match self {
            Self::Jpeg(jpeg_flags) => jpeg_flags.to_cv_param_list(),
            Self::Tiff(tiff_flags) => tiff_flags.to_cv_param_list(),
        }
    }
}
/// Optional JPEG encoding parameters; `None` fields fall back to OpenCV defaults.
#[derive(Default, Debug)]
pub struct CvJpegEncFlags {
    quality: Option<usize>, // IMWRITE_JPEG_QUALITY
    progressive: Option<bool>, // IMWRITE_JPEG_PROGRESSIVE
    optimize: Option<bool>, // IMWRITE_JPEG_OPTIMIZE
    remove_app0: Option<bool>, // strip the APP0 (JFIF) segment after encoding
}
/// Optional TIFF encoding parameters; `None` fields fall back to OpenCV defaults.
#[derive(Default, Debug)]
pub struct CvTiffEncFlags {
    compression: Option<i32>, // IMWRITE_TIFF_COMPRESSION scheme id
}
impl CvTiffEncFlags {
    /// Creates flags with compression scheme 1 (presumably TIFF "none" — TODO confirm).
    pub fn new() -> Self {
        Self::default().with_compression(1)
    }
    /// Sets the TIFF compression scheme id (builder style).
    pub fn with_compression(mut self, compression: i32) -> Self {
        self.compression = Some(compression);
        self
    }
    /// Serializes the set flags as OpenCV's flat `[key, value, ...]` list;
    /// unset (`None`) entries are omitted.
    fn to_cv_param_list(&self) -> Vector<i32> {
        let mut params = Vec::new();
        if let Some(compression) = self.compression {
            params.push(ImwriteFlags::IMWRITE_TIFF_COMPRESSION as i32);
            params.push(compression);
        }
        Vector::from_iter(params)
    }
}
impl CvJpegEncFlags {
    /// Creates an empty flag set; every parameter falls back to OpenCV defaults.
    pub fn new() -> Self {
        Self::default()
    }
    /// Sets the JPEG quality parameter (builder style).
    pub fn with_quality(mut self, quality: usize) -> Self {
        self.quality = Some(quality);
        self
    }
    /// Enables/disables progressive JPEG output.
    /// (Added: the `progressive` field existed but had no builder, so it
    /// could never be set by callers.)
    pub fn with_progressive(mut self, val: bool) -> Self {
        self.progressive = Some(val);
        self
    }
    /// Enables/disables JPEG Huffman-table optimization.
    /// (Added: the `optimize` field existed but had no builder, so it
    /// could never be set by callers.)
    pub fn with_optimize(mut self, val: bool) -> Self {
        self.optimize = Some(val);
        self
    }
    /// Requests removal of the APP0 (JFIF) segment after encoding.
    pub fn remove_app0_marker(mut self, val: bool) -> Self {
        self.remove_app0 = Some(val);
        self
    }
    /// Serializes the set flags as OpenCV's flat `[key, value, ...]` list;
    /// unset (`None`) entries are omitted. `remove_app0` is not an OpenCV
    /// parameter, so it is intentionally absent here.
    fn to_cv_param_list(&self) -> Vector<i32> {
        let iter = [
            (
                ImwriteFlags::IMWRITE_JPEG_QUALITY as i32,
                self.quality.map(|i| i as i32),
            ),
            (
                ImwriteFlags::IMWRITE_JPEG_PROGRESSIVE as i32,
                self.progressive.map(|i| i as i32),
            ),
            (
                ImwriteFlags::IMWRITE_JPEG_OPTIMIZE as i32,
                self.optimize.map(|i| i as i32),
            ),
        ]
        .into_iter()
        .filter_map(|(flag, opt)| opt.map(|o| [flag, o]))
        .flatten();
        Vector::from_iter(iter)
    }
}
impl Encoder for CvEncoder {
    type Input<'a>
        = MatRef<'a>
    where
        Self: 'a;
    /// Encodes the borrowed `Mat` via `imencode` using the configured flags.
    ///
    /// For JPEG output the APP0 (JFIF) segment is stripped afterwards, unless
    /// the caller opted out via `CvJpegEncFlags::remove_app0_marker(false)`.
    /// (Bug fix: `remove_app0` was previously ignored and the segment was
    /// stripped unconditionally; an unset flag keeps that stripping default.)
    fn encode(&self, input: Self::Input<'_>) -> Result<Vec<u8>, NdCvError> {
        let mut buf = Vector::default();
        let params = self.to_cv_param_list();
        imencode(self.extension(), &input.as_ref(), &mut buf, &params).change_context(NdCvError)?;
        // Decide whether APP0 post-processing applies (JPEG only).
        let strip_app0 = match self {
            Self::Jpeg(flags) => flags.remove_app0.unwrap_or(true),
            Self::Tiff(_) => false,
        };
        match self.kind() {
            EncKind::Jpeg if strip_app0 => {
                let bytes = Bytes::from(buf.to_vec());
                let mut jpg = Jpeg::from_bytes(bytes).change_context(NdCvError)?;
                jpg.remove_segments_by_marker(markers::APP0);
                let bytes = jpg.encoder().bytes();
                Ok(bytes.to_vec())
            }
            EncKind::Jpeg | EncKind::Tiff => Ok(buf.to_vec()),
        }
    }
}
/// Image decoder selection together with its format-specific decode flags.
pub enum CvDecoder {
    Jpeg(CvJpegDecFlags),
}
impl CvDecoder {
    /// Flattens the variant's flags into the single `imdecode` flag word.
    fn to_cv_decode_flag(&self) -> i32 {
        // Single-variant enum: this pattern is irrefutable.
        let Self::Jpeg(flags) = self;
        flags.to_cv_decode_flag()
    }
}
/// Requested channel layout for decoding; defaults to color.
#[derive(Default)]
pub enum ColorMode {
    #[default]
    Color,
    GrayScale,
}
impl ColorMode {
    /// Maps the mode to OpenCV's `imread`/`imdecode` mode flag.
    fn to_cv_decode_flag(&self) -> i32 {
        match self {
            // NOTE(review): uses IMREAD_ANYCOLOR (keep the source's channel
            // count) rather than IMREAD_COLOR (force 3 channels) — confirm
            // this is intentional.
            Self::Color => ImreadModes::IMREAD_ANYCOLOR as i32,
            Self::GrayScale => ImreadModes::IMREAD_GRAYSCALE as i32,
        }
    }
}
/// JPEG decode options fed to `imdecode`.
#[derive(Default)]
pub struct CvJpegDecFlags {
    color_mode: ColorMode, // channel layout of the decoded image
    ignore_orientation: bool, // adds IMREAD_IGNORE_ORIENTATION when true
}
impl CvJpegDecFlags {
    /// Default flags: color decode, orientation handling left to OpenCV.
    pub fn new() -> Self {
        Self::default()
    }
    /// Selects the channel layout (builder style).
    pub fn with_color_mode(mut self, color_mode: ColorMode) -> Self {
        self.color_mode = color_mode;
        self
    }
    /// Toggles IMREAD_IGNORE_ORIENTATION (builder style).
    pub fn with_ignore_orientation(mut self, ignore_orientation: bool) -> Self {
        self.ignore_orientation = ignore_orientation;
        self
    }
    /// Combines the color mode and orientation handling into one flag word.
    fn to_cv_decode_flag(&self) -> i32 {
        let mut flag = self.color_mode.to_cv_decode_flag();
        if self.ignore_orientation {
            flag |= ImreadModes::IMREAD_IGNORE_ORIENTATION as i32;
        }
        flag
    }
}
impl Decoder for CvDecoder {
    type Output = Mat;
    /// Decodes an in-memory image buffer into an OpenCV `Mat` via `imdecode`.
    fn decode(&self, input: impl AsRef<[u8]>) -> Result<Self::Output, NdCvError> {
        let cv_buf = Vector::from_slice(input.as_ref());
        imdecode(&cv_buf, self.to_cv_decode_flag()).change_context(NdCvError)
    }
}

View File

@@ -0,0 +1,61 @@
#![deny(warnings)]
use super::codecs::CvDecoder;
use super::error::ErrorReason;
use crate::NdCvError;
use crate::{NdAsImage, conversions::NdCvConversion};
use error_stack::*;
use ndarray::Array;
use std::path::Path;
pub trait Decodable<D: Decoder>: Sized {
fn decode(buf: impl AsRef<[u8]>, decoder: &D) -> Result<Self, NdCvError> {
let output = decoder.decode(buf)?;
Self::transform(output)
}
fn read(&self, path: impl AsRef<Path>, decoder: &D) -> Result<Self, NdCvError> {
let buf = std::fs::read(path)
.map_err(|e| match e.kind() {
std::io::ErrorKind::NotFound => {
Report::new(e).attach_printable(ErrorReason::ImageWriteFileNotFound)
}
std::io::ErrorKind::PermissionDenied => {
Report::new(e).attach_printable(ErrorReason::ImageWritePermissionDenied)
}
std::io::ErrorKind::OutOfMemory => {
Report::new(e).attach_printable(ErrorReason::OutOfMemory)
}
std::io::ErrorKind::StorageFull => {
Report::new(e).attach_printable(ErrorReason::OutOfStorage)
}
_ => Report::new(e).attach_printable(ErrorReason::ImageWriteOtherError),
})
.change_context(NdCvError)?;
Self::decode(buf, decoder)
}
fn transform(input: D::Output) -> Result<Self, NdCvError>;
}
/// Low-level decoder: raw bytes in, format-specific output (e.g. a `Mat`) out.
pub trait Decoder {
    type Output: Sized;
    fn decode(&self, buf: impl AsRef<[u8]>) -> Result<Self::Output, NdCvError>;
}
// Any owned ndarray that can view itself as an image `Mat` is decodable: the
// decoder produces a `Mat`, which is converted into the array via `from_mat`.
impl<T: bytemuck::Pod + Copy, D: ndarray::Dimension> Decodable<CvDecoder> for Array<T, D>
where
    Self: NdAsImage<T, D>,
{
    fn transform(input: <CvDecoder as Decoder>::Output) -> Result<Self, NdCvError> {
        Self::from_mat(input)
    }
}
#[test]
fn decode_image() {
    use crate::codec::codecs::*;
    // The fixture lives on a specific developer machine; skip (rather than
    // fail) when it is absent so the suite stays runnable on CI and elsewhere.
    let path = "/Users/fs0c131y/Projects/face-detector/assets/selfie.jpg";
    if !std::path::Path::new(path).exists() {
        eprintln!("skipping decode_image: fixture {path} not found");
        return;
    }
    let img = std::fs::read(path).unwrap();
    let decoder = CvDecoder::Jpeg(CvJpegDecFlags::new().with_ignore_orientation(true));
    let _out = ndarray::Array3::<u8>::decode(img, &decoder).unwrap();
}

View File

@@ -0,0 +1,56 @@
use super::codecs::CvEncoder;
use super::error::ErrorReason;
use crate::conversions::NdAsImage;
use crate::NdCvError;
use error_stack::*;
use ndarray::ArrayBase;
use std::path::Path;
/// High-level encoding: any type that can transform itself into the encoder's
/// input representation gets `encode` (to bytes) and `write` (to disk) for free.
pub trait Encodable<E: Encoder> {
    /// Encodes `self` with `encoder` and returns the serialized bytes.
    fn encode(&self, encoder: &E) -> Result<Vec<u8>, NdCvError> {
        encoder.encode(self.transform()?)
    }
    /// Encodes `self` and writes the bytes to `path`, attaching a
    /// human-readable `ErrorReason` for common I/O failure kinds.
    fn write(&self, path: impl AsRef<Path>, encoder: &E) -> Result<(), NdCvError> {
        let buf = self.encode(encoder)?;
        std::fs::write(path, buf)
            .map_err(|e| {
                let reason = match e.kind() {
                    std::io::ErrorKind::NotFound => ErrorReason::ImageWriteFileNotFound,
                    std::io::ErrorKind::PermissionDenied => ErrorReason::ImageWritePermissionDenied,
                    std::io::ErrorKind::OutOfMemory => ErrorReason::OutOfMemory,
                    std::io::ErrorKind::StorageFull => ErrorReason::OutOfStorage,
                    _ => ErrorReason::ImageWriteOtherError,
                };
                Report::new(e).attach_printable(reason)
            })
            .change_context(NdCvError)
    }
    /// Borrows/converts `self` into the encoder's input type.
    fn transform(&self) -> Result<<E as Encoder>::Input<'_>, NdCvError>;
}
/// Low-level encoder: takes a format-specific input (a GAT, so it may borrow
/// from the caller) and produces serialized bytes.
pub trait Encoder {
    type Input<'a>
    where
        Self: 'a;
    fn encode(&self, input: Self::Input<'_>) -> Result<Vec<u8>, NdCvError>;
}
// Any ndarray that can view itself as an image `Mat` is encodable: the
// borrowed `MatRef` becomes the encoder input.
impl<T: bytemuck::Pod + Copy, S: ndarray::Data<Elem = T>, D: ndarray::Dimension>
    Encodable<CvEncoder> for ArrayBase<S, D>
where
    Self: NdAsImage<T, D>,
{
    fn transform(&self) -> Result<<CvEncoder as Encoder>::Input<'_>, NdCvError> {
        self.as_image_mat()
    }
}

View File

@@ -0,0 +1,19 @@
/// Human-readable failure categories attached to codec I/O error reports.
#[derive(Debug)]
pub enum ErrorReason {
    ImageReadFileNotFound,
    ImageReadPermissionDenied,
    ImageReadOtherError,
    ImageWriteFileNotFound,
    ImageWritePermissionDenied,
    ImageWriteOtherError,
    OutOfMemory,
    OutOfStorage,
}
impl std::fmt::Display for ErrorReason {
    /// Display simply reuses the derived `Debug` variant name.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Debug::fmt(self, f)
    }
}

View File

@@ -0,0 +1,88 @@
//! Colorspace conversion functions
//! ## Example
//! ```ignore
//! use ndarray::Array3;
//! let arr = Array3::<u8>::ones((100, 100, 3));
//! let out: Array3<u8> = arr.cvt::<Rgba<u8>, Rgb<u8>>();
//! ```
use crate::prelude_::*;
use ndarray::*;
/// Compile-time description of a pixel format: element type, array
/// dimensionality, and channel count.
pub trait ColorSpace {
    type Elem: seal::Sealed;
    type Dim: ndarray::Dimension;
    const CHANNELS: usize;
}
// Restricts color-space element types to the supported pixel depths.
mod seal {
    pub trait Sealed: bytemuck::Pod {}
    // impl<T> Sealed for T {}
    impl Sealed for u8 {} // 0 to 255
    impl Sealed for u16 {} // 0 to 65535
    impl Sealed for f32 {} // 0 to 1
}
/// Declares a zero-sized color-space marker type and its `ColorSpace` impl.
/// `$channels` is the channel count, `$depth` the ndarray dimension type.
macro_rules! define_color_space {
    ($name:ident, $channels:expr, $depth:ty) => {
        pub struct $name<T> {
            __phantom: core::marker::PhantomData<T>,
        }
        impl<T: seal::Sealed> ColorSpace for $name<T> {
            type Elem = T;
            type Dim = $depth;
            const CHANNELS: usize = $channels;
        }
    };
}
define_color_space!(Rgb, 3, Ix3); // 3-channel red/green/blue
define_color_space!(Bgr, 3, Ix3); // 3-channel blue/green/red
define_color_space!(Rgba, 4, Ix3); // 4-channel RGB + alpha
/// Marker trait tying an array type to its element type `T` and dimension `D`.
pub trait NdArray<T, D: ndarray::Dimension> {}
// Bug fix: this was previously `NdArray<S, D>` — the *storage* type `S` was
// used as the element parameter — so `ArrayBase<S, D>` never satisfied the
// `Self: NdArray<T::Elem, T::Dim>` bound required by `ConvertColor`.
impl<T, D: ndarray::Dimension, S: ndarray::Data<Elem = T>> NdArray<T, D> for ArrayBase<S, D> {}
/// Typed color conversion from space `T` to space `U` (e.g. `Rgb` -> `Bgr`).
/// NOTE(review): no live impls exist yet — all candidates are commented out below.
pub trait ConvertColor<T, U>
where
    T: ColorSpace,
    U: ColorSpace,
    Self: NdArray<T::Elem, T::Dim>,
{
    type Output: NdArray<U::Elem, U::Dim>;
    fn cvt(&self) -> Self::Output;
}
// impl<T: seal::Sealed, S: ndarray::Data<Elem = T>> ConvertColor<Rgb<T>, Bgr<T>> for ArrayBase<S, Ix3>
// where
// Self: NdArray<T, Ix3>,
// {
// type Output = ArrayView3<'a, T>;
// fn cvt(&self) -> CowArray<T, Ix3> {
// self.view().permuted_axes([2, 1, 0]).into()
// }
// }
//
// impl<T: seal::Sealed, S: ndarray::Data<Elem = T>> ConvertColor<Bgr<T>, Rgb<T>> for ArrayBase<S, Ix3>
// where
// Self: NdArray<T, Ix3>,
// {
// type Output = ArrayView3<'a, T>;
// fn cvt(&self) -> CowArray<T, Ix3> {
// self.view().permuted_axes([2, 1, 0]).into()
// }
// }
// impl<T: seal::Sealed + num::One + num::Zero, S: ndarray::Data<Elem = T>>
// ConvertColor<Rgb<T>, Rgba<T>> for ArrayBase<S, Ix3>
// {
// fn cvt(&self) -> CowArray<T, Ix3> {
// let mut out = Array3::<T>::zeros((self.height(), self.width(), 4));
// // Zip::from(&mut out).and(self).for_each(|out, &in_| {
// // out[0] = in_[0];
// // out[1] = in_[1];
// // out[2] = in_[2];
// // out[3] = T::one();
// // });
// out.into()
// }
// }

View File

@@ -0,0 +1,113 @@
use crate::{NdAsImage, NdAsImageMut, conversions::MatAsNd, prelude_::*};
pub(crate) mod seal {
    /// Label-image element types accepted as connected-components output
    /// (`i32` and `u16` are the two implemented here).
    pub trait ConnectedComponentOutput: Sized + Copy + bytemuck::Pod + num::Zero {
        /// OpenCV depth constant for `Self`, derived from the element type.
        fn as_cv_type() -> i32 {
            crate::type_depth::<Self>()
        }
    }
    impl ConnectedComponentOutput for i32 {}
    impl ConnectedComponentOutput for u16 {}
}
/// Connected-component labeling of a 2-D image (wraps OpenCV's
/// `connected_components` family). `O` selects the label element type.
pub trait NdCvConnectedComponents<T> {
    /// Returns the per-pixel label image.
    fn connected_components<O: seal::ConnectedComponentOutput>(
        &self,
        connectivity: Connectivity,
    ) -> Result<ndarray::Array2<O>, NdCvError>;
    /// Like `connected_components`, additionally returning per-label
    /// statistics and centroids.
    fn connected_components_with_stats<O: seal::ConnectedComponentOutput>(
        &self,
        connectivity: Connectivity,
    ) -> Result<ConnectedComponentStats<O>, NdCvError>;
}
/// Pixel adjacency used when labeling: 4-way (edges only) or 8-way
/// (edges and corners). The discriminant is passed straight to OpenCV.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Connectivity {
    Four = 4,
    #[default]
    Eight = 8,
}
/// Output bundle of `connected_components_with_stats`.
#[derive(Debug, Clone)]
pub struct ConnectedComponentStats<O: seal::ConnectedComponentOutput> {
    pub num_labels: i32, // label count as reported by OpenCV
    pub labels: ndarray::Array2<O>, // per-pixel label image
    pub stats: ndarray::Array2<i32>, // per-label statistics rows, as returned by OpenCV
    pub centroids: ndarray::Array2<f64>, // per-label centroid rows, as returned by OpenCV
}
// use crate::conversions::NdCvConversionRef;
impl<T: bytemuck::Pod, S: ndarray::Data<Elem = T>> NdCvConnectedComponents<T>
    for ndarray::ArrayBase<S, ndarray::Ix2>
where
    ndarray::Array2<T>: NdAsImage<T, ndarray::Ix2>,
{
    // Labels are written in place through a mutable Mat view over the `labels`
    // buffer, then the ndarray itself is returned.
    // NOTE(review): this relies on OpenCV not reallocating the output Mat.
    // Sizes and types match here, but confirm against the opencv crate's
    // output-array handling.
    fn connected_components<O: seal::ConnectedComponentOutput>(
        &self,
        connectivity: Connectivity,
    ) -> Result<ndarray::Array2<O>, NdCvError> {
        let mat = self.as_image_mat()?;
        let mut labels = ndarray::Array2::<O>::zeros(self.dim());
        let mut cv_labels = labels.as_image_mat_mut()?;
        opencv::imgproc::connected_components(
            mat.as_ref(),
            cv_labels.as_mut(),
            connectivity as i32,
            O::as_cv_type(),
        )
        .change_context(NdCvError)?;
        Ok(labels)
    }
    // Same in-place scheme for `labels`; `stats`/`centroids` are fresh Mats
    // that OpenCV allocates, copied out into owned ndarrays afterwards.
    fn connected_components_with_stats<O: seal::ConnectedComponentOutput>(
        &self,
        connectivity: Connectivity,
    ) -> Result<ConnectedComponentStats<O>, NdCvError> {
        let mut labels = ndarray::Array2::<O>::zeros(self.dim());
        let mut stats = opencv::core::Mat::default();
        let mut centroids = opencv::core::Mat::default();
        let num_labels = opencv::imgproc::connected_components_with_stats(
            self.as_image_mat()?.as_ref(),
            labels.as_image_mat_mut()?.as_mut(),
            &mut stats,
            &mut centroids,
            connectivity as i32,
            O::as_cv_type(),
        )
        .change_context(NdCvError)?;
        let stats = stats.as_ndarray()?.to_owned();
        let centroids = centroids.as_ndarray()?.to_owned();
        Ok(ConnectedComponentStats {
            labels,
            stats,
            centroids,
            num_labels,
        })
    }
}
// #[test]
// fn test_connected_components() {
// use opencv::core::MatTrait as _;
// let mat = opencv::core::Mat::new_nd_with_default(&[10, 10], opencv::core::CV_8UC1, 0.into())
// .expect("failed");
// let roi1 = opencv::core::Rect::new(2, 2, 2, 2);
// let roi2 = opencv::core::Rect::new(6, 6, 3, 3);
// let mut mat1 = opencv::core::Mat::roi(&mat, roi1).expect("failed");
// mat1.set_scalar(1.into()).expect("failed");
// let mut mat2 = opencv::core::Mat::roi(&mat, roi2).expect("failed");
// mat2.set_scalar(1.into()).expect("failed");
// let array2: ndarray::ArrayView2<u8> = mat.as_ndarray().expect("failed");
// let output = array2
// .connected_components::<u16>(Connectivity::Four)
// .expect("failed");
// let expected = {
// let mut expected = ndarray::Array2::zeros((10, 10));
// expected.slice_mut(ndarray::s![2..4, 2..4]).fill(1);
// expected.slice_mut(ndarray::s![6..9, 6..9]).fill(2);
// expected
// };
// assert_eq!(output, expected);
// }

270
ndcv-bridge/src/contours.rs Normal file
View File

@@ -0,0 +1,270 @@
//! <https://docs.rs/opencv/latest/opencv/imgproc/fn.find_contours.html>
#![deny(warnings)]
use crate::conversions::*;
use crate::prelude_::*;
use nalgebra::Point2;
use ndarray::*;
/// Contour retrieval mode; discriminants mirror OpenCV's `RetrievalModes`.
#[repr(C)]
#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
pub enum ContourRetrievalMode {
    #[default]
    External = 0, // RETR_EXTERNAL
    List = 1,     // RETR_LIST
    CComp = 2,    // RETR_CCOMP
    Tree = 3,     // RETR_TREE
    FloodFill = 4, // RETR_FLOODFILL
}
/// Contour point-approximation method; discriminants mirror OpenCV's
/// `ContourApproximationModes` (note: CHAIN_APPROX_NONE is 1, not 0).
#[repr(C)]
#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
pub enum ContourApproximationMethod {
    #[default]
    None = 1,     // CHAIN_APPROX_NONE
    Simple = 2,   // CHAIN_APPROX_SIMPLE
    Tc89L1 = 3,   // CHAIN_APPROX_TC89_L1
    Tc89Kcos = 4, // CHAIN_APPROX_TC89_KCOS
}
/// One hierarchy entry per contour; indices reference sibling/child contours,
/// mirroring OpenCV's `Vec4i` hierarchy layout (-1 means "none").
#[derive(Debug, Clone)]
pub struct ContourHierarchy {
    pub next: i32, // index of the next contour at the same level
    pub previous: i32, // index of the previous contour at the same level
    pub first_child: i32, // index of the first nested contour
    pub parent: i32, // index of the enclosing contour
}
/// Contours plus their hierarchy; `hierarchy[i]` describes `contours[i]`.
#[derive(Debug, Clone)]
pub struct ContourResult {
    pub contours: Vec<Vec<Point2<i32>>>,
    pub hierarchy: Vec<ContourHierarchy>,
}
// Seals `NdCvFindContours` to u8 (binary) images only.
mod seal {
    pub trait Sealed {}
    impl Sealed for u8 {}
}
/// Contour extraction from a binary image (wraps `cv::findContours`).
pub trait NdCvFindContours<T: bytemuck::Pod + seal::Sealed>:
    crate::image::NdImage + crate::conversions::NdAsImage<T, ndarray::Ix2>
{
    /// Finds contours; each contour is a list of integer points.
    fn find_contours(
        &self,
        mode: ContourRetrievalMode,
        method: ContourApproximationMethod,
    ) -> Result<Vec<Vec<Point2<i32>>>, NdCvError>;
    /// Finds contours together with their nesting hierarchy.
    fn find_contours_with_hierarchy(
        &self,
        mode: ContourRetrievalMode,
        method: ContourApproximationMethod,
    ) -> Result<ContourResult, NdCvError>;
    /// `find_contours` with the default mode (External) and method (Simple).
    fn find_contours_def(&self) -> Result<Vec<Vec<Point2<i32>>>, NdCvError> {
        self.find_contours(
            ContourRetrievalMode::External,
            ContourApproximationMethod::Simple,
        )
    }
    /// `find_contours_with_hierarchy` with the default mode and method.
    fn find_contours_with_hierarchy_def(&self) -> Result<ContourResult, NdCvError> {
        self.find_contours_with_hierarchy(
            ContourRetrievalMode::External,
            ContourApproximationMethod::Simple,
        )
    }
}
/// Area of a single contour (wraps `cv::contourArea`). With `oriented` the
/// sign reflects the contour's orientation; otherwise the absolute value.
pub trait NdCvContourArea<T: bytemuck::Pod> {
    fn contours_area(&self, oriented: bool) -> Result<f64, NdCvError>;
    /// Unoriented (absolute) area.
    fn contours_area_def(&self) -> Result<f64, NdCvError> {
        self.contours_area(false)
    }
}
impl<T: ndarray::RawData + ndarray::Data<Elem = u8>> NdCvFindContours<u8> for ArrayBase<T, Ix2> {
    /// Runs `cv::findContours` on the binary u8 image and converts each
    /// OpenCV contour into a `Vec<Point2<i32>>`.
    fn find_contours(
        &self,
        mode: ContourRetrievalMode,
        method: ContourApproximationMethod,
    ) -> Result<Vec<Vec<Point2<i32>>>, NdCvError> {
        let cv_self = self.as_image_mat()?;
        let mut cv_contours =
            opencv::core::Vector::<opencv::core::Vector<opencv::core::Point>>::new();
        opencv::imgproc::find_contours(
            &*cv_self,
            &mut cv_contours,
            mode as i32,
            method as i32,
            opencv::core::Point::new(0, 0),
        )
        .change_context(NdCvError)
        .attach_printable("Failed to find contours")?;
        // Fallibly translate every OpenCV contour into nalgebra points.
        (0..cv_contours.len())
            .map(|idx| {
                let contour = cv_contours.get(idx).change_context(NdCvError)?;
                Ok(contour.iter().map(|pt| Point2::new(pt.x, pt.y)).collect())
            })
            .collect()
    }
    /// Same as `find_contours`, additionally translating OpenCV's `Vec4i`
    /// hierarchy entries into `ContourHierarchy` records.
    fn find_contours_with_hierarchy(
        &self,
        mode: ContourRetrievalMode,
        method: ContourApproximationMethod,
    ) -> Result<ContourResult, NdCvError> {
        let cv_self = self.as_image_mat()?;
        let mut cv_contours =
            opencv::core::Vector::<opencv::core::Vector<opencv::core::Point>>::new();
        let mut cv_hierarchy = opencv::core::Vector::<opencv::core::Vec4i>::new();
        opencv::imgproc::find_contours_with_hierarchy(
            &*cv_self,
            &mut cv_contours,
            &mut cv_hierarchy,
            mode as i32,
            method as i32,
            opencv::core::Point::new(0, 0),
        )
        .change_context(NdCvError)
        .attach_printable("Failed to find contours with hierarchy")?;
        let contours = (0..cv_contours.len())
            .map(|idx| {
                let contour = cv_contours.get(idx).change_context(NdCvError)?;
                Ok(contour.iter().map(|pt| Point2::new(pt.x, pt.y)).collect())
            })
            .collect::<Result<Vec<Vec<Point2<i32>>>, NdCvError>>()?;
        let hierarchy = (0..cv_hierarchy.len())
            .map(|idx| {
                let entry = cv_hierarchy.get(idx).change_context(NdCvError)?;
                Ok(ContourHierarchy {
                    next: entry[0],
                    previous: entry[1],
                    first_child: entry[2],
                    parent: entry[3],
                })
            })
            .collect::<Result<Vec<_>, NdCvError>>()?;
        Ok(ContourResult {
            contours,
            hierarchy,
        })
    }
}
impl<T> NdCvContourArea<T> for Vec<Point2<T>>
where
    T: bytemuck::Pod + num::traits::AsPrimitive<i32> + std::cmp::PartialEq + std::fmt::Debug + Copy,
{
    /// Computes the contour's area via `cv::contourArea`.
    fn contours_area(&self, oriented: bool) -> Result<f64, NdCvError> {
        // Short-circuit instead of handing OpenCV an empty point set.
        if self.is_empty() {
            return Ok(0.0);
        }
        let cv_contour = opencv::core::Vector::<opencv::core::Point>::from_iter(
            self.iter()
                .map(|point| opencv::core::Point::new(point.coords[0].as_(), point.coords[1].as_())),
        );
        opencv::imgproc::contour_area(&cv_contour, oriented)
            .change_context(NdCvError)
            .attach_printable("Failed to calculate contour area")
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array2;
    // 10x10 zero image with a 6x4 filled rectangle at rows 2..8, cols 3..7.
    fn simple_binary_rect_image() -> Array2<u8> {
        let mut img = Array2::<u8>::zeros((10, 10));
        for i in 2..8 {
            for j in 3..7 {
                img[(i, j)] = 255;
            }
        }
        img
    }
    #[test]
    fn test_find_contours_external_simple() {
        let img = simple_binary_rect_image();
        let contours = img
            .find_contours(
                ContourRetrievalMode::External,
                ContourApproximationMethod::Simple,
            )
            .expect("Failed to find contours");
        // One solid rectangle -> exactly one external contour with >= 4 corners.
        assert_eq!(contours.len(), 1);
        assert!(contours[0].len() >= 4);
    }
    #[test]
    fn test_find_contours_with_hierarchy() {
        let img = simple_binary_rect_image();
        let res = img
            .find_contours_with_hierarchy(
                ContourRetrievalMode::External,
                ContourApproximationMethod::Simple,
            )
            .expect("Failed to find contours with hierarchy");
        assert_eq!(res.contours.len(), 1);
        assert_eq!(res.hierarchy.len(), 1);
        // A lone contour has no parent and no children (-1 sentinels).
        let h = &res.hierarchy[0];
        assert_eq!(h.parent, -1);
        assert_eq!(h.first_child, -1);
    }
    #[test]
    fn test_default_methods() {
        // The `_def` helpers must behave like the explicit default arguments.
        let img = simple_binary_rect_image();
        let contours = img.find_contours_def().unwrap();
        let res = img.find_contours_with_hierarchy_def().unwrap();
        assert_eq!(contours.len(), 1);
        assert_eq!(res.contours.len(), 1);
    }
    #[test]
    fn test_contour_area_calculation() {
        let img = simple_binary_rect_image();
        let contours = img.find_contours_def().unwrap();
        // cv::contourArea of the 6x4 pixel rectangle's boundary polygon is
        // (6 - 1) * (4 - 1) = 15 (Green's theorem over pixel centers).
        let expected_area = 15.;
        let area = contours[0].contours_area_def().unwrap();
        assert!(
            (area - expected_area).abs() < 1.0,
            "Area mismatch: got {area}, expected {expected_area}",
        );
    }
    #[test]
    fn test_empty_input_returns_no_contours() {
        let img = Array2::<u8>::zeros((10, 10));
        let contours = img.find_contours_def().unwrap();
        assert!(contours.is_empty());
        let res = img.find_contours_with_hierarchy_def().unwrap();
        assert!(res.contours.is_empty());
        assert!(res.hierarchy.is_empty());
    }
    #[test]
    fn test_contour_area_empty_contour() {
        // Empty contours short-circuit to area 0 without calling OpenCV.
        let contour: Vec<Point2<i32>> = vec![];
        let area = contour.contours_area_def().unwrap();
        assert_eq!(area, 0.0);
    }
}

View File

@@ -0,0 +1,337 @@
//! Mat <--> ndarray conversion traits
//!
//! Conversion Table
//!
//! | ndarray | Mat |
//! |--------- |----- |
//! | Array<T, Ix1> | Mat(ndims = 1, channels = 1) |
//! | Array<T, Ix2> | Mat(ndims = 2, channels = 1) |
//! | Array<T, Ix2> | Mat(ndims = 1, channels = X) |
//! | Array<T, Ix3> | Mat(ndims = 3, channels = 1) |
//! | Array<T, Ix3> | Mat(ndims = 2, channels = X) |
//! | Array<T, Ix4> | Mat(ndims = 4, channels = 1) |
//! | Array<T, Ix4> | Mat(ndims = 3, channels = X) |
//! | Array<T, Ix5> | Mat(ndims = 5, channels = 1) |
//! | Array<T, Ix5> | Mat(ndims = 4, channels = X) |
//! | Array<T, Ix6> | Mat(ndims = 6, channels = 1) |
//! | Array<T, Ix6> | Mat(ndims = 5, channels = X) |
//!
//! // X is the last dimension
use crate::NdCvError;
use crate::type_depth;
use error_stack::*;
use ndarray::{Ix2, Ix3};
use opencv::core::MatTraitConst;
mod impls;
pub(crate) mod matref;
use matref::{MatRef, MatRefMut};
// Seals `NdCvConversion` so only ndarray array types can implement it.
pub(crate) mod seal {
    pub trait SealedInternal {}
    impl<T, S: ndarray::Data<Elem = T>, D> SealedInternal for ndarray::ArrayBase<S, D> {}
    // impl<T, S: ndarray::DataMut<Elem = T>, D> SealedInternal for ndarray::ArrayBase<S, D> {}
}
/// Owned conversions between ndarray arrays and OpenCV `Mat`s
/// (see the conversion table in the module docs above).
pub trait NdCvConversion<T: bytemuck::Pod + Copy, D: ndarray::Dimension>:
    seal::SealedInternal + Sized
{
    /// Converts `self` into an owned `Mat`.
    fn to_mat(&self) -> Result<opencv::core::Mat, NdCvError>;
    /// Copies a `Mat`'s data into a newly owned ndarray.
    fn from_mat(
        mat: opencv::core::Mat,
    ) -> Result<ndarray::ArrayBase<ndarray::OwnedRepr<T>, D>, NdCvError>;
}
impl<T: bytemuck::Pod + Copy, S: ndarray::Data<Elem = T>, D: ndarray::Dimension>
    NdCvConversion<T, D> for ndarray::ArrayBase<S, D>
where
    Self: NdAsImage<T, D>,
{
    fn to_mat(&self) -> Result<opencv::core::Mat, NdCvError> {
        // Builds a borrowed Mat view over `self`, then clones it to detach
        // from the borrow. NOTE(review): assumes the opencv crate's
        // `Mat::clone` deep-copies the pixel data — confirm.
        Ok(self.as_image_mat()?.mat.clone())
    }
    fn from_mat(
        mat: opencv::core::Mat,
    ) -> Result<ndarray::ArrayBase<ndarray::OwnedRepr<T>, D>, NdCvError> {
        // The view borrows `mat`'s buffer; `to_owned` copies the data out
        // before `mat` is dropped at the end of this function.
        let ndarray = unsafe { impls::mat_to_ndarray::<T, D>(&mat) }.change_context(NdCvError)?;
        Ok(ndarray.to_owned())
    }
}
/// Borrowing view of a `Mat`'s data as an ndarray (no copy).
pub trait MatAsNd {
    fn as_ndarray<T: bytemuck::Pod, D: ndarray::Dimension>(
        &self,
    ) -> Result<ndarray::ArrayView<T, D>, NdCvError>;
}
impl MatAsNd for opencv::core::Mat {
    fn as_ndarray<T: bytemuck::Pod, D: ndarray::Dimension>(
        &self,
    ) -> Result<ndarray::ArrayView<T, D>, NdCvError> {
        // The returned view's lifetime is tied to `&self`, so the borrow
        // keeps the Mat's buffer alive.
        unsafe { impls::mat_to_ndarray::<T, D>(self) }.change_context(NdCvError)
    }
}
/// Borrowing views of an ndarray as a `Mat`: either every axis becomes a Mat
/// dimension (`single`) or the trailing axis is folded into channels (`multi`).
pub trait NdAsMat<T: bytemuck::Pod + Copy, D: ndarray::Dimension> {
    fn as_single_channel_mat(&self) -> Result<MatRef, NdCvError>;
    fn as_multi_channel_mat(&self) -> Result<MatRef, NdCvError>;
}
/// Mutable counterparts of `NdAsMat`; require mutable array storage.
pub trait NdAsMatMut<T: bytemuck::Pod + Copy, D: ndarray::Dimension>: NdAsMat<T, D> {
    fn as_single_channel_mat_mut(&mut self) -> Result<MatRefMut, NdCvError>;
    fn as_multi_channel_mat_mut(&mut self) -> Result<MatRefMut, NdCvError>;
}
impl<T: bytemuck::Pod, S: ndarray::Data<Elem = T>, D: ndarray::Dimension> NdAsMat<T, D>
    for ndarray::ArrayBase<S, D>
{
    fn as_single_channel_mat(&self) -> Result<MatRef, NdCvError> {
        // The Mat aliases `self`'s buffer; `MatRef` ties its lifetime to
        // the borrow of `self`.
        let mat = unsafe { impls::ndarray_to_mat_regular(self) }.change_context(NdCvError)?;
        Ok(MatRef::new(mat))
    }
    fn as_multi_channel_mat(&self) -> Result<MatRef, NdCvError> {
        // Same aliasing, but the trailing axis is folded into Mat channels.
        let mat = unsafe { impls::ndarray_to_mat_consolidated(self) }.change_context(NdCvError)?;
        Ok(MatRef::new(mat))
    }
}
impl<T: bytemuck::Pod, S: ndarray::DataMut<Elem = T>, D: ndarray::Dimension> NdAsMatMut<T, D>
    for ndarray::ArrayBase<S, D>
{
    fn as_single_channel_mat_mut(&mut self) -> Result<MatRefMut, NdCvError> {
        // Mutable alias over `self`'s buffer, wrapped so the borrow of
        // `self` outlives the Mat.
        let mat = unsafe { impls::ndarray_to_mat_regular(self) }.change_context(NdCvError)?;
        Ok(MatRefMut::new(mat))
    }
    fn as_multi_channel_mat_mut(&mut self) -> Result<MatRefMut, NdCvError> {
        let mat = unsafe { impls::ndarray_to_mat_consolidated(self) }.change_context(NdCvError)?;
        Ok(MatRefMut::new(mat))
    }
}
/// Dimension-aware "view as image Mat": picks single- vs multi-channel
/// representation based on the array's dimensionality (see impls below).
pub trait NdAsImage<T: bytemuck::Pod, D: ndarray::Dimension> {
    fn as_image_mat(&self) -> Result<MatRef, NdCvError>;
}
/// Mutable counterpart of `NdAsImage`.
pub trait NdAsImageMut<T: bytemuck::Pod, D: ndarray::Dimension> {
    fn as_image_mat_mut(&mut self) -> Result<MatRefMut, NdCvError>;
}
// A 2-D array is a single-channel (gray-scale) image.
impl<T, S> NdAsImage<T, Ix2> for ndarray::ArrayBase<S, Ix2>
where
    T: bytemuck::Pod + Copy,
    S: ndarray::Data<Elem = T>,
{
    fn as_image_mat(&self) -> Result<MatRef, NdCvError> {
        self.as_single_channel_mat()
    }
}
// Mutable 2-D (single-channel) image view.
impl<T, S> NdAsImageMut<T, Ix2> for ndarray::ArrayBase<S, Ix2>
where
    T: bytemuck::Pod + Copy,
    S: ndarray::DataMut<Elem = T>,
{
    fn as_image_mat_mut(&mut self) -> Result<MatRefMut, NdCvError> {
        self.as_single_channel_mat_mut()
    }
}
// A 3-D array is a multi-channel image: the last axis becomes Mat channels.
impl<T, S> NdAsImage<T, Ix3> for ndarray::ArrayBase<S, Ix3>
where
    T: bytemuck::Pod + Copy,
    S: ndarray::Data<Elem = T>,
{
    fn as_image_mat(&self) -> Result<MatRef, NdCvError> {
        self.as_multi_channel_mat()
    }
}
// Mutable 3-D (multi-channel) image view.
impl<T, S> NdAsImageMut<T, Ix3> for ndarray::ArrayBase<S, Ix3>
where
    T: bytemuck::Pod + Copy,
    S: ndarray::DataMut<Elem = T>,
{
    fn as_image_mat_mut(&mut self) -> Result<MatRefMut, NdCvError> {
        self.as_multi_channel_mat_mut()
    }
}
// #[test]
// fn test_1d_mat_to_ndarray() {
// let mat = opencv::core::Mat::new_nd_with_default(
// &[10],
// opencv::core::CV_MAKE_TYPE(opencv::core::CV_8U, 1),
// 200.into(),
// )
// .expect("failed");
// let array: ndarray::ArrayView1<u8> = mat.as_ndarray().expect("failed");
// array.into_iter().for_each(|&x| assert_eq!(x, 200));
// }
// #[test]
// fn test_2d_mat_to_ndarray() {
// let mat = opencv::core::Mat::new_nd_with_default(
// &[10],
// opencv::core::CV_16SC3,
// (200, 200, 200).into(),
// )
// .expect("failed");
// let array2: ndarray::ArrayView2<i16> = mat.as_ndarray().expect("failed");
// assert_eq!(array2.shape(), [10, 3]);
// array2.into_iter().for_each(|&x| {
// assert_eq!(x, 200);
// });
// }
// #[test]
// fn test_3d_mat_to_ndarray() {
// let mat = opencv::core::Mat::new_nd_with_default(
// &[20, 30],
// opencv::core::CV_32FC3,
// (200, 200, 200).into(),
// )
// .expect("failed");
// let array2: ndarray::ArrayView3<f32> = mat.as_ndarray().expect("failed");
// array2.into_iter().for_each(|&x| {
// assert_eq!(x, 200f32);
// });
// }
// #[test]
// fn test_mat_to_dyn_ndarray() {
// let mat = opencv::core::Mat::new_nd_with_default(&[10], opencv::core::CV_8UC1, 200.into())
// .expect("failed");
// let array2: ndarray::ArrayViewD<u8> = mat.as_ndarray().expect("failed");
// array2.into_iter().for_each(|&x| assert_eq!(x, 200));
// }
// #[test]
// fn test_3d_mat_to_ndarray_4k() {
// let mat = opencv::core::Mat::new_nd_with_default(
// &[4096, 4096],
// opencv::core::CV_8UC3,
// (255, 0, 255).into(),
// )
// .expect("failed");
// let array2: ndarray::ArrayView3<u8> = (mat).as_ndarray().expect("failed");
// array2.exact_chunks((1, 1, 3)).into_iter().for_each(|x| {
// assert_eq!(x[(0, 0, 0)], 255);
// assert_eq!(x[(0, 0, 1)], 0);
// assert_eq!(x[(0, 0, 2)], 255);
// });
// }
// // #[test]
// // fn test_3d_mat_to_ndarray_8k() {
// // let mat = opencv::core::Mat::new_nd_with_default(
// // &[8192, 8192],
// // opencv::core::CV_8UC3,
// // (255, 0, 255).into(),
// // )
// // .expect("failed");
// // let array2 = ndarray::Array3::<u8>::from_mat(mat).expect("failed");
// // array2.exact_chunks((1, 1, 3)).into_iter().for_each(|x| {
// // assert_eq!(x[(0, 0, 0)], 255);
// // assert_eq!(x[(0, 0, 1)], 0);
// // assert_eq!(x[(0, 0, 2)], 255);
// // });
// // }
// #[test]
// pub fn test_mat_to_nd_default_strides() {
// let mat = opencv::core::Mat::new_rows_cols_with_default(
// 10,
// 10,
// opencv::core::CV_8UC3,
// opencv::core::VecN([10f64, 0.0, 0.0, 0.0]),
// )
// .expect("failed");
// let array = unsafe { impls::mat_to_ndarray::<u8, Ix3>(&mat) }.expect("failed");
// assert_eq!(array.shape(), [10, 10, 3]);
// assert_eq!(array.strides(), [30, 3, 1]);
// assert_eq!(array[(0, 0, 0)], 10);
// }
// #[test]
// pub fn test_mat_to_nd_custom_strides() {
// let mat = opencv::core::Mat::new_rows_cols_with_default(
// 10,
// 10,
// opencv::core::CV_8UC3,
// opencv::core::VecN([10f64, 0.0, 0.0, 0.0]),
// )
// .unwrap();
// let mat_roi = opencv::core::Mat::roi(&mat, opencv::core::Rect::new(3, 2, 3, 5))
// .expect("failed to get roi");
// let array = unsafe { impls::mat_to_ndarray::<u8, Ix3>(&mat_roi) }.expect("failed");
// assert_eq!(array.shape(), [5, 3, 3]);
// assert_eq!(array.strides(), [30, 3, 1]);
// assert_eq!(array[(0, 0, 0)], 10);
// }
// #[test]
// pub fn test_non_continuous_3d() {
// let array = ndarray::Array3::<f32>::from_shape_fn((10, 10, 4), |(i, j, k)| {
// ((i + 1) * (j + 1) * (k + 1)) as f32
// });
// let slice = array.slice(ndarray::s![3..7, 3..7, 0..4]);
// let mat = unsafe { impls::ndarray_to_mat_consolidated(&slice) }.unwrap();
// let arr = unsafe { impls::mat_to_ndarray::<f32, Ix3>(&mat).unwrap() };
// assert!(slice == arr);
// }
// #[test]
// pub fn test_5d_array() {
// let array = ndarray::Array5::<f32>::ones((1, 2, 3, 4, 5));
// let mat = unsafe { impls::ndarray_to_mat_consolidated(&array) }.unwrap();
// let arr = unsafe { impls::mat_to_ndarray::<f32, ndarray::Ix5>(&mat).unwrap() };
// assert_eq!(array, arr);
// }
// #[test]
// pub fn test_3d_array() {
// let array = ndarray::Array3::<f32>::ones((23, 31, 33));
// let mat = unsafe { impls::ndarray_to_mat_consolidated(&array) }.unwrap();
// let arr = unsafe { impls::mat_to_ndarray::<f32, ndarray::Ix3>(&mat).unwrap() };
// assert_eq!(array, arr);
// }
// #[test]
// pub fn test_2d_array() {
// let array = ndarray::Array2::<f32>::ones((23, 31));
// let mat = unsafe { impls::ndarray_to_mat_consolidated(&array) }.unwrap();
// let arr = unsafe { impls::mat_to_ndarray::<f32, ndarray::Ix2>(&mat).unwrap() };
// assert_eq!(array, arr);
// }
// #[test]
// #[should_panic]
// pub fn test_1d_array_consolidated() {
// let array = ndarray::Array1::<f32>::ones(23);
// let mat = unsafe { impls::ndarray_to_mat_consolidated(&array) }.unwrap();
// let arr = unsafe { impls::mat_to_ndarray::<f32, ndarray::Ix1>(&mat).unwrap() };
// assert_eq!(array, arr);
// }
// #[test]
// pub fn test_1d_array_regular() {
// let array = ndarray::Array1::<f32>::ones(23);
// let mat = unsafe { impls::ndarray_to_mat_regular(&array) }.unwrap();
// let arr = unsafe { impls::mat_to_ndarray::<f32, ndarray::Ix1>(&mat).unwrap() };
// assert_eq!(array, arr);
// }
// #[test]
// pub fn test_2d_array_regular() {
// let array = ndarray::Array2::<f32>::ones((23, 31));
// let mat = unsafe { impls::ndarray_to_mat_regular(&array) }.unwrap();
// let arr = unsafe { impls::mat_to_ndarray::<f32, ndarray::Ix2>(&mat).unwrap() };
// assert_eq!(array, arr);
// }
// #[test]
// pub fn test_ndcv_1024_1024_to_mat() {
// let array = ndarray::Array2::<f32>::ones((1024, 1024));
// let _mat = array.to_mat().unwrap();
// }

View File

@@ -0,0 +1,168 @@
use super::*;
use core::ffi::*;
use opencv::core::prelude::*;
/// Wraps an ndarray in an OpenCV `Mat` header without copying the data.
///
/// Every ndarray axis becomes one `Mat` dimension; the resulting `Mat` is
/// always single-channel (`CV_MAKETYPE(depth, 1)`).
///
/// # Safety
/// The returned `Mat` borrows `input`'s buffer: the caller must keep the
/// array alive (and its storage unmoved) for as long as the `Mat` is used.
pub(crate) unsafe fn ndarray_to_mat_regular<
    T,
    S: ndarray::Data<Elem = T>,
    D: ndarray::Dimension,
>(
    input: &ndarray::ArrayBase<S, D>,
) -> Result<opencv::core::Mat, NdCvError> {
    let shape = input.shape();
    let strides = input.strides();
    let size = shape.iter().copied().map(|f| f as i32).collect::<Vec<_>>();
    // OpenCV keeps only ndims - 1 steps (the innermost step is implied by the
    // element size), so ndarray's last stride is dropped.
    let step_len = strides.len() - 1;
    let step = strides
        .iter()
        .take(step_len)
        .copied()
        // ndarray strides are counted in elements, OpenCV steps in bytes.
        // NOTE(review): negative strides (reversed axes) would wrap when cast
        // to usize — assumes non-reversed layouts; confirm at call sites.
        .map(|f| f as usize * core::mem::size_of::<T>())
        .collect::<Vec<_>>();
    let data_ptr = input.as_ptr() as *const c_void;
    let typ = opencv::core::CV_MAKETYPE(type_depth::<T>(), 1);
    let mat = opencv::core::Mat::new_nd_with_data_unsafe(
        size.as_slice(),
        typ,
        data_ptr.cast_mut(),
        Some(step.as_slice()),
    )
    .change_context(NdCvError)?;
    Ok(mat)
}
/// Wraps an ndarray in an OpenCV `Mat`, folding the last axis into the
/// `Mat`'s channels (e.g. an (H, W, 3) array becomes a 2-D, 3-channel `Mat`).
///
/// Fails when the channel count exceeds `CV_CN_MAX`, when the array is 1-D,
/// or when the two innermost axes are not packed contiguously (OpenCV cannot
/// represent a stride for the channel axis).
///
/// # Safety
/// The returned `Mat` borrows `input`'s buffer: the caller must keep the
/// array alive (and its storage unmoved) for as long as the `Mat` is used.
pub(crate) unsafe fn ndarray_to_mat_consolidated<
    T,
    S: ndarray::Data<Elem = T>,
    D: ndarray::Dimension,
>(
    input: &ndarray::ArrayBase<S, D>,
) -> Result<opencv::core::Mat, NdCvError> {
    let shape = input.shape();
    let strides = input.strides();
    // The last axis is reinterpreted as the Mat's channel count.
    let channels = shape.last().copied().unwrap_or(1);
    if channels > opencv::core::CV_CN_MAX as usize {
        Err(Report::new(NdCvError).attach_printable(format!(
            "Number of channels({channels}) exceeds CV_CN_MAX({}) use the regular version of the function", opencv::core::CV_CN_MAX
        )))?;
    }
    if shape.len() > 2 {
        // For a packed array the second-to-last stride equals the size of the
        // last axis (the channel count). If it does not, the caller sliced
        // into the last axis, and that stride cannot survive the conversion
        // because OpenCV stores only ndims - 1 steps.
        if shape.last() != strides.get(strides.len() - 2).map(|x| *x as usize).as_ref() {
            Err(Report::new(NdCvError).attach_printable(
                "You cannot slice into the last axis in ndarray when converting to mat",
            ))?;
        }
    } else if shape.len() == 1 {
        return Err(Report::new(NdCvError).attach_printable(
            "You cannot convert a 1D array to a Mat while using the consolidated version",
        ));
    }
    // The consolidated form hands OpenCV ndims - 1 sizes and ndims - 2 steps:
    // the last axis became the channels, and the innermost step is implied.
    let size_len = shape.len() - 1; // Since we move last axis into the channel
    let size = shape
        .iter()
        .take(size_len)
        .map(|f| *f as i32)
        .collect::<Vec<_>>();
    let step_len = strides.len() - 1;
    let step = strides
        .iter()
        .take(step_len)
        // ndarray strides are counted in elements, OpenCV steps in bytes.
        .map(|f| *f as usize * core::mem::size_of::<T>())
        .collect::<Vec<_>>();
    let data_ptr = input.as_ptr() as *const c_void;
    let typ = opencv::core::CV_MAKETYPE(type_depth::<T>(), channels as i32);
    let mat = opencv::core::Mat::new_nd_with_data_unsafe(
        size.as_slice(),
        typ,
        data_ptr.cast_mut(),
        Some(step.as_slice()),
    )
    .change_context(NdCvError)?;
    Ok(mat)
}
/// Borrows an OpenCV `Mat` as an ndarray view without copying.
///
/// A multi-channel `Mat` maps to an array with one extra trailing axis for
/// the channels (Mat(ndims = 2, channels = 3) -> `Array3`); single-channel
/// Mats map axis-for-axis.
///
/// # Safety
/// The view aliases the `Mat`'s buffer; the `Mat` must stay alive and
/// unmodified while the view is in use.
pub(crate) unsafe fn mat_to_ndarray<T: bytemuck::Pod, D: ndarray::Dimension>(
    mat: &opencv::core::Mat,
) -> Result<ndarray::ArrayView<'_, T, D>, NdCvError> {
    let depth = mat.depth();
    // Reject element-type mismatches up front instead of silently
    // reinterpreting the bytes.
    if type_depth::<T>() != depth {
        return Err(Report::new(NdCvError).attach_printable(format!(
            "Expected type Mat<{}> ({}), got Mat<{}> ({})",
            std::any::type_name::<T>(),
            type_depth::<T>(),
            crate::depth_type(depth),
            depth,
        )));
    }
    // Since a dims always returns >= 2 we can't use this to check if it's a 1D array
    // So we compare the first axis to the total to see if its a 1D array
    let is_1d = mat.total() as i32 == mat.rows();
    let dims = is_1d.then_some(1).unwrap_or(mat.dims());
    let channels = mat.channels();
    // Multi-channel Mats gain one trailing ndarray axis for the channels.
    let ndarray_size = (channels != 1).then_some(dims + 1).unwrap_or(dims) as usize;
    if let Some(ndim) = D::NDIM {
        // When channels is not 1,
        // the last dimension is the channels
        // Array1 -> Mat(ndims = 1, channels = 1)
        // Array2 -> Mat(ndims = 1, channels = X)
        // Array2 -> Mat(ndims = 2, channels = 1)
        // Array3 -> Mat(ndims = 2, channels = X)
        // Array3 -> Mat(ndims = 3, channels = 1)
        // ...
        // (multi-channel rank mismatches are caught by `into_dimensionality`
        // further below)
        if ndim != dims as usize && channels == 1 {
            return Err(Report::new(NdCvError)
                .attach_printable(format!("Expected {}D array, got {}D", ndim, ndarray_size)));
        }
    }
    let mat_size = mat.mat_size();
    // Sizes: the Mat dimensions plus the channel count as a trailing axis;
    // `take` trims the trailing entry again for single-channel Mats.
    let sizes = (0..dims)
        .map(|i| mat_size.get(i).change_context(NdCvError))
        .chain([Ok(channels)])
        .map(|x| x.map(|x| x as usize))
        .take(ndarray_size)
        .collect::<Result<Vec<_>, NdCvError>>()
        .change_context(NdCvError)?;
    // Strides in elements: `step1` for the outer axes, then `channels` to hop
    // between pixels, then 1 for the channel axis. The final entry is only
    // kept when channels != 1 (trimmed by `take` otherwise).
    let strides = (0..(dims - 1))
        .map(|i| mat.step1(i).change_context(NdCvError))
        .chain([
            Ok(channels as usize),
            Ok((channels == 1).then_some(0).unwrap_or(1)),
        ])
        .take(ndarray_size)
        .collect::<Result<Vec<_>, NdCvError>>()
        .change_context(NdCvError)?;
    use ndarray::ShapeBuilder;
    let shape = sizes.strides(strides);
    let raw_array = ndarray::RawArrayView::from_shape_ptr(shape, mat.data() as *const T)
        .into_dimensionality()
        .change_context(NdCvError)?;
    // SAFETY: pointer, shape and strides all come from a live `Mat` whose
    // element type was verified above; the caller guarantees its lifetime.
    Ok(unsafe { raw_array.deref_into_view() })
}

View File

@@ -0,0 +1,73 @@
/// Non-owning, immutable wrapper around an OpenCV `Mat`.
///
/// The phantom lifetime ties this header to the data it was created from so
/// the borrow checker keeps the underlying storage alive.
#[derive(Debug, Clone)]
pub struct MatRef<'a> {
    pub(crate) mat: opencv::core::Mat,
    pub(crate) _marker: core::marker::PhantomData<&'a ()>,
}
impl MatRef<'_> {
    /// Returns a clone of the wrapped `Mat` that the caller owns outright.
    pub fn clone_pointee(&self) -> opencv::core::Mat {
        self.mat.clone()
    }
}
impl MatRef<'_> {
    /// Wraps a `Mat`, binding it to the caller-chosen lifetime `'a`.
    pub fn new<'a>(mat: opencv::core::Mat) -> MatRef<'a> {
        MatRef {
            mat,
            _marker: core::marker::PhantomData,
        }
    }
}
impl AsRef<opencv::core::Mat> for MatRef<'_> {
    fn as_ref(&self) -> &opencv::core::Mat {
        &self.mat
    }
}
impl AsRef<opencv::core::Mat> for MatRefMut<'_> {
    fn as_ref(&self) -> &opencv::core::Mat {
        &self.mat
    }
}
impl AsMut<opencv::core::Mat> for MatRefMut<'_> {
    fn as_mut(&mut self) -> &mut opencv::core::Mat {
        &mut self.mat
    }
}
/// Non-owning, mutable wrapper around an OpenCV `Mat`.
// NOTE(review): `derive(Clone)` lets callers duplicate a supposedly unique
// mutable handle — confirm that is intentional.
#[derive(Debug, Clone)]
pub struct MatRefMut<'a> {
    pub(crate) mat: opencv::core::Mat,
    pub(crate) _marker: core::marker::PhantomData<&'a mut ()>,
}
impl MatRefMut<'_> {
    /// Wraps a `Mat`, binding it to the caller-chosen lifetime `'a`.
    pub fn new<'a>(mat: opencv::core::Mat) -> MatRefMut<'a> {
        MatRefMut {
            mat,
            _marker: core::marker::PhantomData,
        }
    }
}
impl core::ops::Deref for MatRef<'_> {
    type Target = opencv::core::Mat;
    fn deref(&self) -> &Self::Target {
        &self.mat
    }
}
impl core::ops::Deref for MatRefMut<'_> {
    type Target = opencv::core::Mat;
    fn deref(&self) -> &Self::Target {
        &self.mat
    }
}
impl core::ops::DerefMut for MatRefMut<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.mat
    }
}

262
ndcv-bridge/src/fir.rs Normal file
View File

@@ -0,0 +1,262 @@
use error_stack::*;
use fast_image_resize::*;
use images::{Image, ImageRef};
/// Error type for the ndarray <-> `fast_image_resize` bridge.
#[derive(Debug, Clone, thiserror::Error)]
#[error("NdFirError")]
pub struct NdFirError;
/// Local result alias defaulting the error to an `error_stack` report.
type Result<T, E = Report<NdFirError>> = std::result::Result<T, E>;
/// Borrow an ndarray as an immutable `fast_image_resize` image.
pub trait NdAsImage<T: seal::Sealed, D: ndarray::Dimension>: Sized {
    fn as_image_ref(&self) -> Result<ImageRef<'_>>;
}
/// Borrow an ndarray as a mutable `fast_image_resize` image.
pub trait NdAsImageMut<T: seal::Sealed, D: ndarray::Dimension>: Sized {
    fn as_image_ref_mut(&mut self) -> Result<Image<'_>>;
}
/// Holds a read-only view of an ndarray for image adaptation.
pub struct NdarrayImageContainer<'a, T: seal::Sealed, D: ndarray::Dimension> {
    #[allow(dead_code)]
    data: ndarray::ArrayView<'a, T, D>,
    // NOTE(review): redundant — `data` already carries both type parameters.
    // Kept because the field is `pub` and part of the type's interface.
    pub _phantom: std::marker::PhantomData<(T, D)>,
}
impl<'a, T: seal::Sealed> NdarrayImageContainer<'a, T, ndarray::Ix3> {
    /// Wraps a 3-D (rows, cols, channels) array view.
    pub fn new<S: ndarray::Data<Elem = T>>(array: &'a ndarray::ArrayBase<S, ndarray::Ix3>) -> Self {
        Self {
            data: array.view(),
            _phantom: std::marker::PhantomData,
        }
    }
}
impl<'a, T: seal::Sealed> NdarrayImageContainer<'a, T, ndarray::Ix2> {
    /// Wraps a 2-D (rows, cols) single-channel array view.
    pub fn new<S: ndarray::Data<Elem = T>>(array: &'a ndarray::ArrayBase<S, ndarray::Ix2>) -> Self {
        Self {
            data: array.view(),
            _phantom: std::marker::PhantomData,
        }
    }
}
/// Mutable counterpart of [`NdarrayImageContainer`].
pub struct NdarrayImageContainerMut<'a, T: seal::Sealed, D: ndarray::Dimension> {
    #[allow(dead_code)]
    data: ndarray::ArrayViewMut<'a, T, D>,
}
impl<'a, T: seal::Sealed> NdarrayImageContainerMut<'a, T, ndarray::Ix3> {
    /// Wraps a mutable 3-D (rows, cols, channels) array view.
    pub fn new<S: ndarray::DataMut<Elem = T>>(
        array: &'a mut ndarray::ArrayBase<S, ndarray::Ix3>,
    ) -> Self {
        Self {
            data: array.view_mut(),
        }
    }
}
impl<'a, T: seal::Sealed> NdarrayImageContainerMut<'a, T, ndarray::Ix2> {
    /// Wraps a mutable 2-D (rows, cols) single-channel array view.
    pub fn new<S: ndarray::DataMut<Elem = T>>(
        array: &'a mut ndarray::ArrayBase<S, ndarray::Ix2>,
    ) -> Self {
        Self {
            data: array.view_mut(),
        }
    }
}
/// View holder additionally tagged with a concrete `fast_image_resize`
/// pixel type `P`.
pub struct NdarrayImageContainerTyped<'a, T: seal::Sealed, D: ndarray::Dimension, P: PixelTrait> {
    #[allow(dead_code)]
    data: ndarray::ArrayView<'a, T, D>,
    __marker: std::marker::PhantomData<P>,
}
// unsafe impl<'a, T: seal::Sealed + Sync + InnerPixel, P: PixelTrait> ImageView
// for NdarrayImageContainerTyped<'a, T, ndarray::Ix3, P>
// where
// T: bytemuck::Pod,
// {
// type Pixel = P;
// fn width(&self) -> u32 {
// self.data.shape()[1] as u32
// }
// fn height(&self) -> u32 {
// self.data.shape()[0] as u32
// }
// fn iter_rows(&self, start_row: u32) -> impl Iterator<Item = &[Self::Pixel]> {
// self.data
// .rows()
// .into_iter()
// .skip(start_row as usize)
// .map(|row| {
// row.as_slice()
// .unwrap_or_default()
// .chunks_exact(P::CHANNELS as usize)
// })
// }
// }
// impl<'a, T: fast_image_resize::pixels::InnerPixel + seal::Sealed, D: ndarray::Dimension>
// fast_image_resize::IntoImageView for NdarrayImageContainer<'a, T, D>
// {
// fn pixel_type(&self) -> Option<PixelType> {
// match D::NDIM {
// Some(2) => Some(to_pixel_type::<T>(1).expect("Failed to convert to pixel type")),
// Some(3) => Some(
// to_pixel_type::<T>(self.data.shape()[2]).expect("Failed to convert to pixel type"),
// ),
// _ => None,
// }
// }
// fn width(&self) -> u32 {
// self.data.shape()[1] as u32
// }
// fn height(&self) -> u32 {
// self.data.shape()[0] as u32
// }
// fn image_view<P: PixelTrait>(&'a self) -> Option<NdarrayImageContainerTyped<'a, T, D, P>> {
// Some(NdarrayImageContainerTyped {
// data: self.data.view(),
// __marker: std::marker::PhantomData,
// })
// }
// }
/// Maps an element type `T` plus a channel count to a `fast_image_resize`
/// [`PixelType`].
///
/// # Errors
/// Returns an `NdFirError` for any (type, channels) combination that
/// `fast_image_resize` does not support.
pub fn to_pixel_type<T: seal::Sealed>(u: usize) -> Result<PixelType> {
    let unsupported = || Report::new(NdFirError).attach_printable("Unsupported pixel type");
    let pixel = match core::any::type_name::<T>() {
        "u8" => match u {
            1 => PixelType::U8,
            2 => PixelType::U8x2,
            3 => PixelType::U8x3,
            4 => PixelType::U8x4,
            _ => return Err(unsupported()),
        },
        "u16" if u == 1 => PixelType::U16,
        "i32" if u == 1 => PixelType::I32,
        "f32" => match u {
            1 => PixelType::F32,
            2 => PixelType::F32x2,
            3 => PixelType::F32x3,
            4 => PixelType::F32x4,
            _ => return Err(unsupported()),
        },
        _ => return Err(unsupported()),
    };
    Ok(pixel)
}
/// Private marker trait restricting these image APIs to the element types
/// that `fast_image_resize` pixels are built from.
mod seal {
    pub trait Sealed {}
    impl Sealed for u8 {}
    impl Sealed for u16 {}
    impl Sealed for i32 {}
    impl Sealed for f32 {}
}
impl<S: ndarray::Data<Elem = T>, T: seal::Sealed + bytemuck::Pod, D: ndarray::Dimension>
    NdAsImage<T, D> for ndarray::ArrayBase<S, D>
{
    /// Borrows the array as an immutable `fast_image_resize` image (no copy).
    ///
    /// Axes are read as (rows, cols, channels); missing trailing axes default
    /// to 1. The array must be contiguous.
    // NOTE(review): arrays with more than 3 axes would misreport their
    // geometry here — confirm callers only pass 2-D/3-D arrays.
    fn as_image_ref(&self) -> Result<ImageRef> {
        let shape = self.shape();
        let rows = *shape
            .first()
            .ok_or_else(|| Report::new(NdFirError).attach_printable("Failed to get rows"))?
            as u32;
        let cols = *shape.get(1).unwrap_or(&1) as u32;
        let channels = *shape.get(2).unwrap_or(&1);
        let data = self
            .as_slice()
            .ok_or(NdFirError)
            .attach_printable("The ndarray is non continuous")?;
        // Reinterpret the element slice as raw bytes for fast_image_resize.
        let data_bytes: &[u8] = bytemuck::cast_slice(data);
        let pixel_type = to_pixel_type::<T>(channels)?;
        ImageRef::new(cols, rows, data_bytes, pixel_type)
            .change_context(NdFirError)
            .attach_printable("Failed to create Image from ndarray")
    }
}
impl<S: ndarray::DataMut<Elem = T>, T: seal::Sealed + bytemuck::Pod, D: ndarray::Dimension>
    NdAsImageMut<T, D> for ndarray::ArrayBase<S, D>
{
    /// Borrows the array as a mutable `fast_image_resize` image (no copy);
    /// same (rows, cols, channels) interpretation as [`NdAsImage`].
    fn as_image_ref_mut(&mut self) -> Result<Image<'_>>
    where
        S: ndarray::DataMut<Elem = T>,
    {
        let shape = self.shape();
        let rows = *shape
            .first()
            .ok_or_else(|| Report::new(NdFirError).attach_printable("Failed to get rows"))?
            as u32;
        let cols = *shape.get(1).unwrap_or(&1) as u32;
        let channels = *shape.get(2).unwrap_or(&1);
        let data = self
            .as_slice_mut()
            .ok_or(NdFirError)
            .attach_printable("The ndarray is non continuous")?;
        // Reinterpret the element slice as raw bytes for fast_image_resize.
        let data_bytes: &mut [u8] = bytemuck::cast_slice_mut(data);
        let pixel_type = to_pixel_type::<T>(channels)?;
        Image::from_slice_u8(cols, rows, data_bytes, pixel_type)
            .change_context(NdFirError)
            .attach_printable("Failed to create Image from ndarray")
    }
}
/// Resize an array with `fast_image_resize`, allocating the destination.
pub trait NdFir<T, D> {
    fn fast_resize<'o>(
        &self,
        height: usize,
        width: usize,
        options: impl Into<Option<&'o ResizeOptions>>,
    ) -> Result<ndarray::Array<T, D>>;
}
impl<T: seal::Sealed + bytemuck::Pod + num::Zero, S: ndarray::Data<Elem = T>> NdFir<T, ndarray::Ix3>
    for ndarray::ArrayBase<S, ndarray::Ix3>
{
    /// Resizes an (H, W, C) array to (height, width, C).
    fn fast_resize<'o>(
        &self,
        height: usize,
        width: usize,
        options: impl Into<Option<&'o ResizeOptions>>,
    ) -> Result<ndarray::Array3<T>> {
        let source = self.as_image_ref()?;
        // The channel count carries over from the source unchanged.
        let (_height, _width, channels) = self.dim();
        let mut dest = ndarray::Array3::<T>::zeros((height, width, channels));
        let mut dest_image = dest.as_image_ref_mut()?;
        let mut resizer = fast_image_resize::Resizer::default();
        resizer
            .resize(&source, &mut dest_image, options)
            .change_context(NdFirError)?;
        Ok(dest)
    }
}
impl<T: seal::Sealed + bytemuck::Pod + num::Zero, S: ndarray::Data<Elem = T>> NdFir<T, ndarray::Ix2>
    for ndarray::ArrayBase<S, ndarray::Ix2>
{
    /// Resizes a single-channel (H, W) array to (height, width).
    fn fast_resize<'o>(
        &self,
        height: usize,
        width: usize,
        options: impl Into<Option<&'o ResizeOptions>>,
    ) -> Result<ndarray::Array<T, ndarray::Ix2>> {
        let source = self.as_image_ref()?;
        let (_height, _width) = self.dim();
        let mut dest = ndarray::Array::<T, ndarray::Ix2>::zeros((height, width));
        let mut dest_image = dest.as_image_ref_mut()?;
        let mut resizer = fast_image_resize::Resizer::default();
        resizer
            .resize(&source, &mut dest_image, options)
            .change_context(NdFirError)?;
        Ok(dest)
    }
}
/// Smoke test — arrays are (rows, cols, channels), so this resizes a
/// 1920x1080 3-channel image down to 1280x720.
#[test]
pub fn test_ndarray_fast_image_resize_u8() {
    let source_fhd = ndarray::Array3::<u8>::ones((1920, 1080, 3));
    let mut resized_hd = ndarray::Array3::<u8>::zeros((1280, 720, 3));
    let mut resizer = fast_image_resize::Resizer::default();
    resizer
        .resize(
            &source_fhd.as_image_ref().unwrap(),
            &mut resized_hd.as_image_ref_mut().unwrap(),
            None,
        )
        .unwrap();
    assert_eq!(resized_hd.shape(), [1280, 720, 3]);
}

307
ndcv-bridge/src/gaussian.rs Normal file
View File

@@ -0,0 +1,307 @@
//! <https://docs.rs/opencv/latest/opencv/imgproc/fn.gaussian_blur.html>
use crate::conversions::*;
use crate::prelude_::*;
use ndarray::*;
/// OpenCV border-extrapolation modes; the discriminants mirror
/// `opencv::core::BorderTypes` so the enum can be cast straight to `i32`.
#[repr(C)]
#[derive(Default, Debug, Copy, Clone)]
pub enum BorderType {
    #[default]
    BorderConstant = 0,
    BorderReplicate = 1,
    BorderReflect = 2,
    BorderWrap = 3,
    BorderReflect101 = 4,
    BorderTransparent = 5,
    BorderIsolated = 16,
}
/// OpenCV algorithm hints; the discriminants mirror `cv::AlgorithmHint`.
#[repr(C)]
#[derive(Default, Debug, Copy, Clone)]
pub enum AlgorithmHint {
    #[default]
    AlgoHintDefault = 0,
    AlgoHintAccurate = 1,
    AlgoHintApprox = 2,
}
/// Element types accepted by `gaussian_blur`.
mod seal {
    pub trait Sealed {}
    // src: input image; the image can have any number of channels, which are processed independently, but the depth should be CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
    impl Sealed for u8 {}
    impl Sealed for u16 {}
    impl Sealed for i16 {}
    impl Sealed for f32 {}
    impl Sealed for f64 {}
}
/// Gaussian blur that returns a newly allocated array.
pub trait NdCvGaussianBlur<T: bytemuck::Pod + seal::Sealed, D: ndarray::Dimension>:
    crate::image::NdImage + crate::conversions::NdAsImage<T, D>
{
    /// Blurs with the given kernel size (width, height) and sigmas; the
    /// border mode controls how pixels outside the image are extrapolated.
    fn gaussian_blur(
        &self,
        kernel_size: (u8, u8),
        sigma_x: f64,
        sigma_y: f64,
        border_type: BorderType,
    ) -> Result<ndarray::Array<T, D>, NdCvError>;
    /// Convenience overload: sigma_y = sigma_x, constant border.
    // NOTE(review): OpenCV's own gaussian_blur default border is
    // BORDER_DEFAULT (Reflect101), not Constant — confirm this is deliberate.
    fn gaussian_blur_def(
        &self,
        kernel: (u8, u8),
        sigma_x: f64,
    ) -> Result<ndarray::Array<T, D>, NdCvError> {
        self.gaussian_blur(kernel, sigma_x, sigma_x, BorderType::BorderConstant)
    }
}
impl<
    T: bytemuck::Pod + num::Zero + seal::Sealed,
    S: ndarray::RawData + ndarray::Data<Elem = T>,
    D: ndarray::Dimension,
> NdCvGaussianBlur<T, D> for ArrayBase<S, D>
where
    ndarray::ArrayBase<S, D>: crate::image::NdImage + crate::conversions::NdAsImage<T, D>,
    ndarray::Array<T, D>: crate::conversions::NdAsImageMut<T, D>,
{
    fn gaussian_blur(
        &self,
        kernel_size: (u8, u8),
        sigma_x: f64,
        sigma_y: f64,
        border_type: BorderType,
    ) -> Result<ndarray::Array<T, D>, NdCvError> {
        // Same-shaped destination; OpenCV reads and writes through zero-copy
        // Mat views of both arrays.
        let mut dst = ndarray::Array::zeros(self.dim());
        let cv_self = self.as_image_mat()?;
        let mut cv_dst = dst.as_image_mat_mut()?;
        opencv::imgproc::gaussian_blur(
            &*cv_self,
            &mut *cv_dst,
            opencv::core::Size {
                width: kernel_size.0 as i32,
                height: kernel_size.1 as i32,
            },
            sigma_x,
            sigma_y,
            border_type as i32,
        )
        .change_context(NdCvError)
        .attach_printable("Failed to apply gaussian blur")?;
        Ok(dst)
    }
}
// impl<
// T: bytemuck::Pod + num::Zero + seal::Sealed,
// S: ndarray::RawData + ndarray::Data<Elem = T>,
// > NdCvGaussianBlur<T, Ix3> for ArrayBase<S, Ix3>
// {
// fn gaussian_blur(
// &self,
// kernel_size: (u8, u8),
// sigma_x: f64,
// sigma_y: f64,
// border_type: BorderType,
// ) -> Result<ndarray::Array<T, Ix3>, NdCvError> {
// let mut dst = ndarray::Array::zeros(self.dim());
// let cv_self = self.as_image_mat()?;
// let mut cv_dst = dst.as_image_mat_mut()?;
// opencv::imgproc::gaussian_blur(
// &*cv_self,
// &mut *cv_dst,
// opencv::core::Size {
// width: kernel_size.0 as i32,
// height: kernel_size.1 as i32,
// },
// sigma_x,
// sigma_y,
// border_type as i32,
// )
// .change_context(NdCvError)
// .attach_printable("Failed to apply gaussian blur")?;
// Ok(dst)
// }
// }
//
// impl<
// T: bytemuck::Pod + num::Zero + seal::Sealed,
// S: ndarray::RawData + ndarray::Data<Elem = T>,
// > NdCvGaussianBlur<T, Ix2> for ArrayBase<S, Ix2>
// {
// fn gaussian_blur(
// &self,
// kernel_size: (u8, u8),
// sigma_x: f64,
// sigma_y: f64,
// border_type: BorderType,
// ) -> Result<ndarray::Array<T, Ix2>, NdCvError> {
// let mut dst = ndarray::Array::zeros(self.dim());
// let cv_self = self.as_image_mat()?;
// let mut cv_dst = dst.as_image_mat_mut()?;
// opencv::imgproc::gaussian_blur(
// &*cv_self,
// &mut *cv_dst,
// opencv::core::Size {
// width: kernel_size.0 as i32,
// height: kernel_size.1 as i32,
// },
// sigma_x,
// sigma_y,
// border_type as i32,
// )
// .change_context(NdCvError)
// .attach_printable("Failed to apply gaussian blur")?;
// Ok(dst)
// }
// }
/// In-place Gaussian blur.
///
/// Skips allocating a destination array. Per the original author's
/// measurements, on a 4k f32 image this is about 50% faster than the
/// allocating version, while for small inputs the allocating version can be
/// faster.
pub trait NdCvGaussianBlurInPlace<T: bytemuck::Pod + seal::Sealed, D: ndarray::Dimension>:
    crate::image::NdImage + crate::conversions::NdAsImageMut<T, D>
{
    /// Blurs `self` in place and returns it for chaining.
    fn gaussian_blur_inplace(
        &mut self,
        kernel_size: (u8, u8),
        sigma_x: f64,
        sigma_y: f64,
        border_type: BorderType,
    ) -> Result<&mut Self, NdCvError>;
    /// Convenience overload: sigma_y = sigma_x, constant border.
    fn gaussian_blur_def_inplace(
        &mut self,
        kernel: (u8, u8),
        sigma_x: f64,
    ) -> Result<&mut Self, NdCvError> {
        self.gaussian_blur_inplace(kernel, sigma_x, sigma_x, BorderType::BorderConstant)
    }
}
impl<
    T: bytemuck::Pod + num::Zero + seal::Sealed,
    S: ndarray::RawData + ndarray::DataMut<Elem = T>,
    D: ndarray::Dimension,
> NdCvGaussianBlurInPlace<T, D> for ArrayBase<S, D>
where
    Self: crate::image::NdImage + crate::conversions::NdAsImageMut<T, D>,
{
    fn gaussian_blur_inplace(
        &mut self,
        kernel_size: (u8, u8),
        sigma_x: f64,
        sigma_y: f64,
        border_type: BorderType,
    ) -> Result<&mut Self, NdCvError> {
        let mut cv_self = self.as_image_mat_mut()?;
        // SAFETY: `op_inplace` aliases src and dst onto the same Mat, which
        // OpenCV's gaussian_blur supports (in-place capable function).
        unsafe {
            crate::inplace::op_inplace(&mut *cv_self, |this, out| {
                opencv::imgproc::gaussian_blur(
                    this,
                    out,
                    opencv::core::Size {
                        width: kernel_size.0 as i32,
                        height: kernel_size.1 as i32,
                    },
                    sigma_x,
                    sigma_y,
                    border_type as i32,
                )
            })
        }
        .change_context(NdCvError)
        .attach_printable("Failed to apply gaussian blur")?;
        Ok(self)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array3;
    /// Blurring a constant image must preserve the shape.
    #[test]
    fn test_gaussian_basic() {
        let arr = Array3::<u8>::ones((10, 10, 3));
        let kernel_size = (3, 3);
        let sigma_x = 0.0;
        let sigma_y = 0.0;
        let border_type = BorderType::BorderConstant;
        let res = arr
            .gaussian_blur(kernel_size, sigma_x, sigma_y, border_type)
            .unwrap();
        assert_eq!(res.shape(), &[10, 10, 3]);
    }
    /// A hard black/white edge must smear into intermediate grey values.
    #[test]
    fn test_gaussian_edge_preservation() {
        // Create an image with a sharp edge
        let mut arr = Array3::<u8>::zeros((10, 10, 3));
        arr.slice_mut(s![..5, .., ..]).fill(255); // Top half white, bottom half black
        let res = arr
            .gaussian_blur((3, 3), 1.0, 1.0, BorderType::BorderConstant)
            .unwrap();
        // Check that the middle row (edge) has intermediate values
        let middle_row = res.slice(s![4..6, 5, 0]);
        assert!(middle_row.iter().all(|&x| x > 0 && x < 255));
    }
    /// Shape must be preserved for every (odd) kernel size.
    #[test]
    fn test_gaussian_different_kernel_sizes() {
        let arr = Array3::<u8>::ones((20, 20, 3));
        // Test different kernel sizes
        let kernel_sizes = [(3, 3), (5, 5), (7, 7)];
        for &kernel_size in &kernel_sizes {
            let res = arr
                .gaussian_blur(kernel_size, 1.0, 1.0, BorderType::BorderConstant)
                .unwrap();
            assert_eq!(res.shape(), &[20, 20, 3]);
        }
    }
    /// Every supported border mode must run cleanly and keep the shape.
    #[test]
    fn test_gaussian_different_border_types() {
        let mut arr = Array3::<u8>::zeros((10, 10, 3));
        arr.slice_mut(s![4..7, 4..7, ..]).fill(255); // White square in center
        let border_types = [
            BorderType::BorderConstant,
            BorderType::BorderReplicate,
            BorderType::BorderReflect,
            BorderType::BorderReflect101,
        ];
        for border_type in border_types {
            let res = arr.gaussian_blur((3, 3), 1.0, 1.0, border_type).unwrap();
            assert_eq!(res.shape(), &[10, 10, 3]);
        }
    }
    /// The blur is generic over element type (u8 and f32 here).
    #[test]
    fn test_gaussian_different_types() {
        // Test with different numeric types
        let arr_u8 = Array3::<u8>::ones((10, 10, 3));
        let arr_f32 = Array3::<f32>::ones((10, 10, 3));
        let res_u8 = arr_u8
            .gaussian_blur((3, 3), 1.0, 1.0, BorderType::BorderConstant)
            .unwrap();
        let res_f32 = arr_f32
            .gaussian_blur((3, 3), 1.0, 1.0, BorderType::BorderConstant)
            .unwrap();
        assert_eq!(res_u8.shape(), &[10, 10, 3]);
        assert_eq!(res_f32.shape(), &[10, 10, 3]);
    }
    /// OpenCV rejects even kernel sizes, so the call must error.
    #[test]
    #[should_panic]
    fn test_gaussian_invalid_kernel_size() {
        let arr = Array3::<u8>::ones((10, 10, 3));
        // Even kernel sizes should fail
        let _ = arr
            .gaussian_blur((2, 2), 1.0, 1.0, BorderType::BorderConstant)
            .unwrap();
    }
}

30
ndcv-bridge/src/image.rs Normal file
View File

@@ -0,0 +1,30 @@
use ndarray::*;
/// Minimal image-geometry view of an ndarray: height x width (x channels).
pub trait NdImage {
    /// Number of columns.
    fn width(&self) -> usize;
    /// Number of rows.
    fn height(&self) -> usize;
    /// Number of channels.
    fn channels(&self) -> usize;
}
/// Three-axis arrays are interpreted as (height, width, channels).
impl<T, S: RawData<Elem = T>> NdImage for ArrayBase<S, Ix3> {
    fn width(&self) -> usize {
        let (_, width, _) = self.dim();
        width
    }
    fn height(&self) -> usize {
        let (height, _, _) = self.dim();
        height
    }
    fn channels(&self) -> usize {
        let (_, _, channels) = self.dim();
        channels
    }
}
/// Two-axis arrays are interpreted as a single-channel (height, width) image.
impl<T, S: RawData<Elem = T>> NdImage for ArrayBase<S, Ix2> {
    fn width(&self) -> usize {
        let (_, width) = self.dim();
        width
    }
    fn height(&self) -> usize {
        let (height, _) = self.dim();
        height
    }
    fn channels(&self) -> usize {
        // No channel axis on a 2-D array: always one channel.
        1
    }
}

View File

@@ -0,0 +1,14 @@
use opencv::core::Mat;
use opencv::prelude::*;
use opencv::Result;
/// Runs an OpenCV function that takes distinct src/dst Mats with the *same*
/// Mat as both arguments, enabling in-place operation.
///
/// A second, non-owning `Mat` header (`m_alias`) is conjured from `m`'s raw
/// pointer so `f` can be handed what looks like two Mats sharing one buffer.
///
/// # Safety
/// `f` must tolerate its src and dst aliasing the same storage (true only
/// for OpenCV functions documented as in-place capable).
#[inline(always)]
pub(crate) unsafe fn op_inplace<T>(
    m: &mut Mat,
    f: impl FnOnce(&Mat, &mut Mat) -> Result<T>,
) -> Result<T> {
    // SAFETY: `m_alias` wraps the same underlying cv::Mat as `m`; it must
    // never run its destructor, hence the `into_raw` below.
    let mut m_alias = Mat::from_raw(m.as_raw_mut());
    let out = f(m, &mut m_alias);
    // Leak the alias handle back into a raw pointer so only `m` owns the Mat.
    // NOTE(review): if `f` panics this line is skipped and the alias would
    // drop the shared Mat during unwinding — confirm callers cannot panic.
    let _ = m_alias.into_raw();
    out
}

83
ndcv-bridge/src/lib.rs Normal file
View File

@@ -0,0 +1,83 @@
//! Methods and type conversions for ndarray to opencv and vice versa
mod blend;
// mod dilate;
pub mod fir;
mod image;
mod inplace;
pub mod percentile;
mod roi;
#[cfg(feature = "opencv")]
pub mod bounding_rect;
// #[cfg(feature = "opencv")]
// pub mod color_space;
#[cfg(feature = "opencv")]
pub mod connected_components;
#[cfg(feature = "opencv")]
pub mod contours;
#[cfg(feature = "opencv")]
pub mod conversions;
// #[cfg(feature = "opencv")]
// pub mod gaussian;
#[cfg(feature = "opencv")]
pub mod resize;
pub mod codec;
pub mod orient;
pub use blend::NdBlend;
pub use fast_image_resize::{FilterType, ResizeAlg, ResizeOptions, Resizer};
pub use fir::NdFir;
// pub use gaussian::{BorderType, NdCvGaussianBlur, NdCvGaussianBlurInPlace};
pub use roi::{NdRoi, NdRoiMut, NdRoiZeroPadded};
#[cfg(feature = "opencv")]
pub use contours::{
ContourApproximationMethod, ContourHierarchy, ContourResult, ContourRetrievalMode,
NdCvContourArea, NdCvFindContours,
};
#[cfg(feature = "opencv")]
pub use bounding_rect::BoundingRect;
#[cfg(feature = "opencv")]
pub use connected_components::{Connectivity, NdCvConnectedComponents};
#[cfg(feature = "opencv")]
pub use conversions::{MatAsNd, NdAsImage, NdAsImageMut, NdAsMat, NdAsMatMut, NdCvConversion};
#[cfg(feature = "opencv")]
pub use resize::{Interpolation, NdCvResize};
/// Crate-internal prelude: the shared error type plus `error_stack` helpers.
pub(crate) mod prelude_ {
    pub use crate::NdCvError;
    pub use error_stack::*;
}
/// Opaque error type for all ndarray <-> OpenCV bridge failures; context is
/// attached via `error_stack` reports at each call site.
#[derive(Debug, thiserror::Error)]
#[error("NdCvError")]
pub struct NdCvError;
/// Maps a primitive element type `T` to its OpenCV depth constant
/// (`u8` -> `CV_8U`, `f32` -> `CV_32F`, ...).
///
/// # Panics
/// Panics when `T` has no OpenCV depth equivalent.
#[cfg(feature = "opencv")]
pub fn type_depth<T>() -> i32 {
    // `type_name` returns the bare name ("u8", "f32", ...) for primitives,
    // which is what the match arms rely on.
    match std::any::type_name::<T>() {
        "u8" => opencv::core::CV_8U,
        "i8" => opencv::core::CV_8S,
        "u16" => opencv::core::CV_16U,
        "i16" => opencv::core::CV_16S,
        "i32" => opencv::core::CV_32S,
        "f32" => opencv::core::CV_32F,
        "f64" => opencv::core::CV_64F,
        // Diagnostic fix: name the offending type instead of the bare
        // "Unsupported type" message.
        other => panic!("Unsupported type for OpenCV depth: {other}"),
    }
}
/// Inverse of `type_depth`: maps an OpenCV depth constant back to the Rust
/// primitive's name.
///
/// # Panics
/// Panics on a depth value that is not one of the supported constants.
#[cfg(feature = "opencv")]
pub fn depth_type(depth: i32) -> &'static str {
    match depth {
        opencv::core::CV_8U => "u8",
        opencv::core::CV_8S => "i8",
        opencv::core::CV_16U => "u16",
        opencv::core::CV_16S => "i16",
        opencv::core::CV_32S => "i32",
        opencv::core::CV_32F => "f32",
        opencv::core::CV_64F => "f64",
        // Diagnostic fix: include the unexpected depth value in the message.
        other => panic!("Unsupported OpenCV depth: {other}"),
    }
}

188
ndcv-bridge/src/orient.rs Normal file
View File

@@ -0,0 +1,188 @@
use ndarray::{Array, ArrayBase, ArrayView};
/// EXIF-style image orientation. `Water` is a vertical flip (mirror across
/// the horizontal axis); `Mirror` is a horizontal flip.
#[derive(Clone, Copy)]
pub enum Orientation {
    NoRotation,
    Mirror,
    Clock180,
    Water,
    MirrorClock270,
    Clock90,
    MirrorClock90,
    Clock270,
    Unknown,
}
impl Orientation {
    /// Inverse transform.
    ///
    /// Every variant except the pure 90 degree rotations is an involution
    /// (mirrors and 180 degree rotations undo themselves, and a mirror
    /// combined with a rotation is itself a reflection), so only
    /// `Clock90`/`Clock270` need to swap.
    pub fn inverse(&self) -> Self {
        match self {
            Self::Clock90 => Self::Clock270,
            Self::Clock270 => Self::Clock90,
            _ => *self,
        }
    }
}
impl Orientation {
    /// Maps the raw EXIF orientation value (1..=8) to a variant; any other
    /// value becomes `Unknown`.
    pub fn from_raw(flip: u8) -> Self {
        match flip {
            1 => Orientation::NoRotation,
            2 => Orientation::Mirror,
            3 => Orientation::Clock180,
            4 => Orientation::Water,
            5 => Orientation::MirrorClock270,
            6 => Orientation::Clock90,
            7 => Orientation::MirrorClock90,
            8 => Orientation::Clock270,
            _ => Orientation::Unknown,
        }
    }
}
/// Clockwise quarter-turn rotations.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RotationFlag {
    Clock90,
    Clock180,
    Clock270,
}
impl RotationFlag {
    /// The opposite rotation (90 <-> 270; 180 is its own inverse).
    pub fn neg(&self) -> Self {
        match self {
            RotationFlag::Clock90 => RotationFlag::Clock270,
            RotationFlag::Clock180 => RotationFlag::Clock180,
            RotationFlag::Clock270 => RotationFlag::Clock90,
        }
    }
    /// Widens a pure rotation into the full `Orientation` enum.
    pub fn to_orientation(&self) -> Orientation {
        match self {
            RotationFlag::Clock90 => Orientation::Clock90,
            RotationFlag::Clock180 => Orientation::Clock180,
            RotationFlag::Clock270 => Orientation::Clock270,
        }
    }
}
/// Mirror axes: `Mirror` flips left/right, `Water` flips top/bottom, `Both`
/// flips both (equivalent to a 180 degree rotation).
#[derive(Clone, Copy)]
pub enum FlipFlag {
    Mirror,
    Water,
    Both,
}
/// Flip/rotate operations for image-like arrays, plus applying and undoing
/// EXIF orientations.
pub trait Orient<T: bytemuck::Pod, D: ndarray::Dimension> {
    /// Mirrors the array along the requested axis (or both).
    fn flip(&self, flip: FlipFlag) -> Array<T, D>;
    /// Rotates the array clockwise by the given quarter turns.
    fn rotate(&self, rotation: RotationFlag) -> Array<T, D>;
    /// Owned, unchanged copy of the array.
    fn owned(&self) -> Array<T, D>;
    /// Undoes a previously applied orientation by applying its inverse.
    fn unorient(&self, orientation: Orientation) -> Array<T, D>
    where
        Array<T, D>: Orient<T, D>,
        Self: ToOwned<Owned = Array<T, D>>,
    {
        let inverse_orientation = orientation.inverse();
        self.orient(inverse_orientation)
    }
    /// Applies an orientation — for the mirror+rotate variants the mirror is
    /// applied first, then the rotation — and returns the result in
    /// standard (row-major, contiguous) layout.
    fn orient(&self, orientation: Orientation) -> Array<T, D>
    where
        Array<T, D>: Orient<T, D>,
    {
        match orientation {
            Orientation::NoRotation | Orientation::Unknown => self.owned(),
            Orientation::Mirror => self.flip(FlipFlag::Mirror).to_owned(),
            Orientation::Clock180 => self.rotate(RotationFlag::Clock180),
            Orientation::Water => self.flip(FlipFlag::Water).to_owned(),
            Orientation::MirrorClock270 => self
                .flip(FlipFlag::Mirror)
                .rotate(RotationFlag::Clock270)
                .to_owned(),
            Orientation::Clock90 => self.rotate(RotationFlag::Clock90),
            Orientation::MirrorClock90 => self
                .flip(FlipFlag::Mirror)
                .rotate(RotationFlag::Clock90)
                .to_owned(),
            Orientation::Clock270 => self.rotate(RotationFlag::Clock270),
        }
        // Normalise to contiguous memory so downstream code can rely on
        // `as_slice()` succeeding.
        .as_standard_layout()
        .to_owned()
    }
}
/// (H, W, C) images: flips and rotations act on the two spatial axes; the
/// channel axis is never touched.
impl<T: bytemuck::Pod + Copy, S: ndarray::Data<Elem = T>> Orient<T, ndarray::Ix3>
    for ArrayBase<S, ndarray::Ix3>
{
    fn flip(&self, flip: FlipFlag) -> Array<T, ndarray::Ix3> {
        match flip {
            // Negative-step slices reverse an axis without copying...
            FlipFlag::Mirror => self.slice(ndarray::s![.., ..;-1, ..]),
            FlipFlag::Water => self.slice(ndarray::s![..;-1, .., ..]),
            FlipFlag::Both => self.slice(ndarray::s![..;-1, ..;-1, ..]),
        }
        // ...and this copy makes the result contiguous again.
        .as_standard_layout()
        .to_owned()
    }
    fn owned(&self) -> Array<T, ndarray::Ix3> {
        self.to_owned()
    }
    fn rotate(&self, rotation: RotationFlag) -> Array<T, ndarray::Ix3> {
        // A quarter turn = transpose of the spatial axes + a flip.
        match rotation {
            RotationFlag::Clock90 => self
                .view()
                .permuted_axes([1, 0, 2])
                .flip(FlipFlag::Mirror)
                .to_owned(),
            RotationFlag::Clock180 => self.flip(FlipFlag::Both).to_owned(),
            RotationFlag::Clock270 => self
                .view()
                .permuted_axes([1, 0, 2])
                .flip(FlipFlag::Water)
                .to_owned(),
        }
    }
}
/// Single-channel (H, W) images.
impl<T: bytemuck::Pod + Copy, S: ndarray::Data<Elem = T>> Orient<T, ndarray::Ix2>
    for ArrayBase<S, ndarray::Ix2>
{
    fn flip(&self, flip: FlipFlag) -> Array<T, ndarray::Ix2> {
        match flip {
            FlipFlag::Mirror => self.slice(ndarray::s![.., ..;-1,]),
            FlipFlag::Water => self.slice(ndarray::s![..;-1, ..,]),
            FlipFlag::Both => self.slice(ndarray::s![..;-1, ..;-1,]),
        }
        .as_standard_layout()
        .to_owned()
    }
    fn owned(&self) -> Array<T, ndarray::Ix2> {
        self.to_owned()
    }
    fn rotate(&self, rotation: RotationFlag) -> Array<T, ndarray::Ix2> {
        // `t()` transposes the matrix; the flip completes the quarter turn.
        match rotation {
            RotationFlag::Clock90 => self.t().flip(FlipFlag::Mirror).to_owned(),
            RotationFlag::Clock180 => self.flip(FlipFlag::Both).to_owned(),
            RotationFlag::Clock270 => self.t().flip(FlipFlag::Water).to_owned(),
        }
    }
}

View File

@@ -0,0 +1,63 @@
use error_stack::*;
use ndarray::{ArrayBase, Ix1};
use num::cast::AsPrimitive;
use crate::NdCvError;
/// Linear-interpolation percentile (quantile) of a 1-D array.
pub trait Percentile {
    /// `qth_percentile` is a fraction in `[0, 1]`, not 0-100.
    fn percentile(&self, qth_percentile: f64) -> Result<f64, NdCvError>;
}
impl<T: std::cmp::Ord + Clone + AsPrimitive<f64>, S: ndarray::Data<Elem = T>> Percentile
    for ArrayBase<S, Ix1>
{
    /// Computes the `qth_percentile` quantile using linear interpolation
    /// between the two nearest ranks (sorts a copy; `self` is not mutated
    /// because `as_standard_layout` clones on write).
    ///
    /// # Errors
    /// Fails on an empty array or when `qth_percentile` is outside `[0, 1]`.
    fn percentile(&self, qth_percentile: f64) -> Result<f64, NdCvError> {
        if self.is_empty() {
            return Err(error_stack::Report::new(NdCvError).attach_printable("Empty Input"));
        }
        // Bug fix: the upper bound must be inclusive so that 1.0 — the
        // maximum element — is a valid percentile request (the old
        // `0_f64..1_f64` half-open range rejected it).
        if !(0_f64..=1_f64).contains(&qth_percentile) {
            return Err(error_stack::Report::new(NdCvError)
                .attach_printable("Qth percentile must be between 0 and 1"));
        }
        let mut standard_array = self.as_standard_layout();
        let raw_data = standard_array
            .as_slice_mut()
            .expect("An array in standard layout will always return its inner slice");
        // Order beyond sortedness is irrelevant afterwards, so the faster,
        // non-allocating unstable sort is fine.
        raw_data.sort_unstable();
        // Fractional rank of the requested quantile.
        let actual_index = qth_percentile * (raw_data.len() - 1) as f64;
        let lower_index = (actual_index.floor() as usize).clamp(0, raw_data.len() - 1);
        let upper_index = (actual_index.ceil() as usize).clamp(0, raw_data.len() - 1);
        if lower_index == upper_index {
            Ok(raw_data[lower_index].as_())
        } else {
            // Linear interpolation between the two surrounding ranks.
            let weight = actual_index - lower_index as f64;
            Ok(raw_data[lower_index].as_() * (1.0 - weight) + raw_data[upper_index].as_() * weight)
        }
    }
}
// fn percentile(data: &Array1<f64>, p: f64) -> f64 {
// if data.len() == 0 {
// return 0.0;
// }
//
// let mut sorted_data = data.to_vec();
// sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
//
// let index = (p / 100.0) * (sorted_data.len() - 1) as f64;
// let lower = index.floor() as usize;
// let upper = index.ceil() as usize;
//
// if lower == upper {
// sorted_data[lower] as f64
// } else {
// let weight = index - lower as f64;
// sorted_data[lower] as f64 * (1.0 - weight) + sorted_data[upper] as f64 * weight
// }
// }

108
ndcv-bridge/src/resize.rs Normal file
View File

@@ -0,0 +1,108 @@
use crate::{prelude_::*, NdAsImage, NdAsImageMut};
/// Resize ndarray using OpenCV resize functions.
pub trait NdCvResize<T, D>: seal::SealedInternal {
    /// The input array must be a continuous 2D or 3D ndarray.
    fn resize(
        &self,
        height: u16,
        width: u16,
        interpolation: Interpolation,
    ) -> Result<ndarray::ArrayBase<ndarray::OwnedRepr<T>, D>, NdCvError>;
}
/// OpenCV interpolation flags; discriminants are the `opencv::imgproc`
/// constants so the enum casts straight to `i32`.
// NOTE(review): `INTER_MAX` is a flag mask in OpenCV, not a selectable
// interpolation mode — confirm the `Max` variant is intentional.
#[repr(i32)]
#[derive(Debug, Copy, Clone)]
pub enum Interpolation {
    Linear = opencv::imgproc::INTER_LINEAR,
    LinearExact = opencv::imgproc::INTER_LINEAR_EXACT,
    Max = opencv::imgproc::INTER_MAX,
    Area = opencv::imgproc::INTER_AREA,
    Cubic = opencv::imgproc::INTER_CUBIC,
    Nearest = opencv::imgproc::INTER_NEAREST,
    NearestExact = opencv::imgproc::INTER_NEAREST_EXACT,
    Lanczos4 = opencv::imgproc::INTER_LANCZOS4,
}
/// Restricts `NdCvResize` to 2-D/3-D arrays of `Pod` elements.
mod seal {
    pub trait SealedInternal {}
    impl<T: bytemuck::Pod, S: ndarray::Data<Elem = T>> SealedInternal
        for ndarray::ArrayBase<S, ndarray::Ix3>
    {
    }
    impl<T: bytemuck::Pod, S: ndarray::Data<Elem = T>> SealedInternal
        for ndarray::ArrayBase<S, ndarray::Ix2>
    {
    }
}
/// OpenCV-backed resize for 2-D (single-plane) arrays.
impl<T: bytemuck::Pod + num::Zero, S: ndarray::Data<Elem = T>> NdCvResize<T, ndarray::Ix2>
    for ndarray::ArrayBase<S, ndarray::Ix2>
{
    fn resize(
        &self,
        height: u16,
        width: u16,
        interpolation: Interpolation,
    ) -> Result<ndarray::Array2<T>, NdCvError> {
        // Zero-copy view of the source as an OpenCV Mat.
        let src_mat = self.as_image_mat()?;
        // Allocate the destination up front so OpenCV writes into it in place.
        let mut resized = ndarray::Array2::zeros((usize::from(height), usize::from(width)));
        let mut resized_mat = resized.as_image_mat_mut()?;
        let target = opencv::core::Size {
            height: height.into(),
            width: width.into(),
        };
        // fx/fy of 0.0 tell OpenCV to derive the scale factors from `target`.
        opencv::imgproc::resize(
            src_mat.as_ref(),
            resized_mat.as_mut(),
            target,
            0.,
            0.,
            interpolation as i32,
        )
        .change_context(NdCvError)?;
        Ok(resized)
    }
}
/// OpenCV-backed resize for 3-D (height x width x channels) arrays.
impl<T: bytemuck::Pod + num::Zero, S: ndarray::Data<Elem = T>> NdCvResize<T, ndarray::Ix3>
    for ndarray::ArrayBase<S, ndarray::Ix3>
{
    fn resize(
        &self,
        height: u16,
        width: u16,
        interpolation: Interpolation,
    ) -> Result<ndarray::ArrayBase<ndarray::OwnedRepr<T>, ndarray::Ix3>, NdCvError> {
        // Zero-copy view of the source as an OpenCV Mat.
        let src_mat = self.as_image_mat()?;
        // The channel count is preserved; only the spatial dims change.
        let channels = self.len_of(ndarray::Axis(2));
        let mut resized =
            ndarray::Array3::zeros((usize::from(height), usize::from(width), channels));
        let mut resized_mat = resized.as_image_mat_mut()?;
        let target = opencv::core::Size {
            height: height.into(),
            width: width.into(),
        };
        // fx/fy of 0.0: OpenCV computes the scale factors from `target`.
        opencv::imgproc::resize(
            src_mat.as_ref(),
            resized_mat.as_mut(),
            target,
            0.,
            0.,
            interpolation as i32,
        )
        .change_context(NdCvError)?;
        Ok(resized)
    }
}
#[test]
fn test_resize_simple() {
    // Upscaling a constant 2-D image must stay constant under linear interpolation.
    let foo = ndarray::Array2::<u8>::ones((10, 10));
    let foo_resized = foo.resize(15, 20, Interpolation::Linear).unwrap();
    assert_eq!(foo_resized, ndarray::Array2::<u8>::ones((15, 20)));
}
#[test]
fn test_resize_3d() {
    // Same invariant as test_resize_simple but for a 3-channel image:
    // the channel axis must be preserved by resize.
    let foo = ndarray::Array3::<u8>::ones((10, 10, 3));
    let foo_resized = foo.resize(15, 20, Interpolation::Linear).unwrap();
    assert_eq!(foo_resized, ndarray::Array3::<u8>::ones((15, 20, 3)));
}

274
ndcv-bridge/src/roi.rs Normal file
View File

@@ -0,0 +1,274 @@
/// Borrowing region-of-interest views over 2-D/3-D ndarrays.
pub trait NdRoi<T, D>: seal::Sealed {
    /// Immutable view of `rect` (x maps to columns, y maps to rows).
    /// Panics if `rect` extends outside the array (slice bounds check).
    fn roi(&self, rect: bounding_box::Aabb2<usize>) -> ndarray::ArrayView<T, D>;
}
/// Mutable counterpart of [`NdRoi`].
pub trait NdRoiMut<T, D>: seal::Sealed {
    /// Mutable view of `rect` (x maps to columns, y maps to rows).
    fn roi_mut(&mut self, rect: bounding_box::Aabb2<usize>) -> ndarray::ArrayViewMut<T, D>;
}
/// Private seal: restricts `NdRoi`/`NdRoiMut` impls to 2-D and 3-D arrays.
mod seal {
    use ndarray::{Ix2, Ix3};
    pub trait Sealed {}
    impl<T: bytemuck::Pod, S: ndarray::Data<Elem = T>> Sealed for ndarray::ArrayBase<S, Ix2> {}
    impl<T: bytemuck::Pod, S: ndarray::Data<Elem = T>> Sealed for ndarray::ArrayBase<S, Ix3> {}
}
impl<T: bytemuck::Pod, S: ndarray::Data<Elem = T>> NdRoi<T, ndarray::Ix3>
    for ndarray::ArrayBase<S, ndarray::Ix3>
{
    /// Immutable view of `rect`: rows `y1..y2`, columns `x1..x2`, all channels.
    fn roi(&self, rect: bounding_box::Aabb2<usize>) -> ndarray::ArrayView3<T> {
        let (top, bottom) = (rect.y1(), rect.y2());
        let (left, right) = (rect.x1(), rect.x2());
        self.slice(ndarray::s![top..bottom, left..right, ..])
    }
}
impl<T: bytemuck::Pod, S: ndarray::DataMut<Elem = T>> NdRoiMut<T, ndarray::Ix3>
    for ndarray::ArrayBase<S, ndarray::Ix3>
{
    /// Mutable view of `rect`: rows `y1..y2`, columns `x1..x2`, all channels.
    fn roi_mut(&mut self, rect: bounding_box::Aabb2<usize>) -> ndarray::ArrayViewMut3<T> {
        let (top, bottom) = (rect.y1(), rect.y2());
        let (left, right) = (rect.x1(), rect.x2());
        self.slice_mut(ndarray::s![top..bottom, left..right, ..])
    }
}
impl<T: bytemuck::Pod, S: ndarray::Data<Elem = T>> NdRoi<T, ndarray::Ix2>
    for ndarray::ArrayBase<S, ndarray::Ix2>
{
    /// Immutable view of `rect`: rows `y1..y2`, columns `x1..x2`.
    fn roi(&self, rect: bounding_box::Aabb2<usize>) -> ndarray::ArrayView2<T> {
        let (top, bottom) = (rect.y1(), rect.y2());
        let (left, right) = (rect.x1(), rect.x2());
        self.slice(ndarray::s![top..bottom, left..right])
    }
}
impl<T: bytemuck::Pod, S: ndarray::DataMut<Elem = T>> NdRoiMut<T, ndarray::Ix2>
    for ndarray::ArrayBase<S, ndarray::Ix2>
{
    /// Mutable view of `rect`: rows `y1..y2`, columns `x1..x2`.
    fn roi_mut(&mut self, rect: bounding_box::Aabb2<usize>) -> ndarray::ArrayViewMut2<T> {
        let (top, bottom) = (rect.y1(), rect.y2());
        let (left, right) = (rect.x1(), rect.x2());
        self.slice_mut(ndarray::s![top..bottom, left..right])
    }
}
#[test]
fn test_roi() {
    // A 3x3 box anchored at (1, 1) selects a 3x3 spatial window across all 3 channels.
    let arr = ndarray::Array3::<u8>::zeros((10, 10, 3));
    let rect = bounding_box::Aabb2::from_xywh(1, 1, 3, 3);
    let roi = arr.roi(rect);
    assert_eq!(roi.shape(), &[3, 3, 3]);
}
#[test]
fn test_roi_2d() {
    // Same as test_roi but for the 2-D impl: a 3x3 box yields a 3x3 view.
    let arr = ndarray::Array2::<u8>::zeros((10, 10));
    let rect = bounding_box::Aabb2::from_xywh(1, 1, 3, 3);
    let roi = arr.roi(rect);
    assert_eq!(roi.shape(), &[3, 3]);
}
/// ```text
/// ┌──────────────────┐
/// │ padded │
/// │ ┌────────┐ │
/// │ │ │ │
/// │ │original│ │
/// │ │ │ │
/// │ └────────┘ │
/// │ zeroed │
/// └──────────────────┘
/// ```
///
/// `padded` is the enlarged bounding box; `original` is the tight bounding box
/// contained within it.
///
/// Returns the `padded` bounding box together with a zero-filled array of the
/// padded size, into which the pixels of the `original` region have been copied.
// Helper functions for missing methods from old bbox crate
/// Top-left corner of `bbox` as an `(x, y)` pair.
fn bbox_top_left_usize(bbox: &bounding_box::Aabb2<usize>) -> (usize, usize) {
    let x = bbox.x1();
    let y = bbox.y1();
    (x, y)
}
/// Re-anchor `bbox` so its top-left corner sits at `(x, y)`, keeping its extent.
fn bbox_with_top_left_usize(
    bbox: &bounding_box::Aabb2<usize>,
    x: usize,
    y: usize,
) -> bounding_box::Aabb2<usize> {
    let extent_w = bbox.x2() - bbox.x1();
    let extent_h = bbox.y2() - bbox.y1();
    bounding_box::Aabb2::from_xywh(x, y, extent_w, extent_h)
}
/// Translate `point` into the coordinate frame anchored at `origin`.
///
/// Callers guarantee `origin <= point` component-wise (the origin is the
/// top-left of a box containing the point). A bare `-` would silently wrap on
/// underflow in release builds, producing a huge index and a confusing slice
/// panic much later; `checked_sub` fails loudly at the actual bug site.
fn bbox_with_origin_usize(point: (usize, usize), origin: (usize, usize)) -> (usize, usize) {
    let x = point
        .0
        .checked_sub(origin.0)
        .expect("origin x exceeds point x");
    let y = point
        .1
        .checked_sub(origin.1)
        .expect("origin y exceeds point y");
    (x, y)
}
/// Zero-filled 2-D array shaped like `bbox` (rows = height, cols = width).
fn bbox_zeros_ndarray_2d<T: num::Zero + Copy>(
    bbox: &bounding_box::Aabb2<usize>,
) -> ndarray::Array2<T> {
    let (w, h) = (bbox.x2() - bbox.x1(), bbox.y2() - bbox.y1());
    ndarray::Array2::<T>::zeros((h, w))
}
/// Zero-filled 3-D array shaped like `bbox` with `channels` planes
/// (rows = height, cols = width, depth = channels).
fn bbox_zeros_ndarray_3d<T: num::Zero + Copy>(
    bbox: &bounding_box::Aabb2<usize>,
    channels: usize,
) -> ndarray::Array3<T> {
    let (w, h) = (bbox.x2() - bbox.x1(), bbox.y2() - bbox.y1());
    ndarray::Array3::<T>::zeros((h, w, channels))
}
/// Round every coordinate of `bbox` to the nearest integer (still `f64`).
fn bbox_round_f64(bbox: &bounding_box::Aabb2<f64>) -> bounding_box::Aabb2<f64> {
    bounding_box::Aabb2::from_x1y1x2y2(
        bbox.x1().round(),
        bbox.y1().round(),
        bbox.x2().round(),
        bbox.y2().round(),
    )
}
/// Cast an `f64` box to `usize` coordinates via truncation.
///
/// Rust's float-to-int `as` cast saturates: negative values clamp to 0 and
/// NaN becomes 0. Round first (see `bbox_round_f64`) if nearest-integer
/// behavior is wanted.
fn bbox_cast_f64_to_usize(bbox: &bounding_box::Aabb2<f64>) -> bounding_box::Aabb2<usize> {
    let x1 = bbox.x1() as usize;
    let y1 = bbox.y1() as usize;
    let x2 = bbox.x2() as usize;
    let y2 = bbox.y2() as usize;
    bounding_box::Aabb2::from_x1y1x2y2(x1, y1, x2, y2)
}
/// Extract an ROI padded with zeros (see the diagram above).
pub trait NdRoiZeroPadded<T, D: ndarray::Dimension>: seal::Sealed {
    /// Copy the `original` region of `self` into a zero-filled array sized
    /// like `padded`. Both boxes must be fully contained in `self`, and
    /// `original` must lie inside `padded` (its top-left must not precede
    /// `padded`'s); violations panic.
    fn roi_zero_padded(
        &self,
        original: bounding_box::Aabb2<usize>,
        padded: bounding_box::Aabb2<usize>,
    ) -> (bounding_box::Aabb2<usize>, ndarray::Array<T, D>);
}
impl<T: bytemuck::Pod + num::Zero> NdRoiZeroPadded<T, ndarray::Ix2> for ndarray::Array2<T> {
    fn roi_zero_padded(
        &self,
        original: bounding_box::Aabb2<usize>,
        padded: bounding_box::Aabb2<usize>,
    ) -> (bounding_box::Aabb2<usize>, ndarray::Array2<T>) {
        // Both rectangles must lie fully inside `self`; out-of-bounds input
        // is a caller bug, so fail loudly instead of clamping.
        let bounds = bounding_box::Aabb2::from_xywh(0, 0, self.shape()[1], self.shape()[0]);
        if !bounds.contains_bbox(&original) {
            panic!("original bounding box is not contained in self");
        }
        if !bounds.contains_bbox(&padded) {
            panic!("padded bounding box is not contained in self");
        }
        // Where the original box sits inside the padded box's local frame.
        let offset = bbox_with_origin_usize(
            bbox_top_left_usize(&original),
            bbox_top_left_usize(&padded),
        );
        let inner = bbox_with_top_left_usize(&original, offset.0, offset.1);
        // Zero canvas sized like `padded`, with the original pixels copied in.
        let mut canvas = bbox_zeros_ndarray_2d::<T>(&padded);
        canvas.roi_mut(inner).assign(&self.roi(original));
        (padded, canvas)
    }
}
impl<T: bytemuck::Pod + num::Zero> NdRoiZeroPadded<T, ndarray::Ix3> for ndarray::Array3<T> {
    fn roi_zero_padded(
        &self,
        original: bounding_box::Aabb2<usize>,
        padded: bounding_box::Aabb2<usize>,
    ) -> (bounding_box::Aabb2<usize>, ndarray::Array3<T>) {
        // Both rectangles must lie fully inside `self`; out-of-bounds input
        // is a caller bug, so fail loudly instead of clamping.
        let bounds = bounding_box::Aabb2::from_xywh(0, 0, self.shape()[1], self.shape()[0]);
        if !bounds.contains_bbox(&original) {
            panic!("original bounding box is not contained in self");
        }
        if !bounds.contains_bbox(&padded) {
            panic!("padded bounding box is not contained in self");
        }
        // Where the original box sits inside the padded box's local frame.
        let offset = bbox_with_origin_usize(
            bbox_top_left_usize(&original),
            bbox_top_left_usize(&padded),
        );
        let inner = bbox_with_top_left_usize(&original, offset.0, offset.1);
        // Zero canvas sized like `padded` with the same channel count as self.
        let mut canvas = bbox_zeros_ndarray_3d::<T>(&padded, self.len_of(ndarray::Axis(2)));
        canvas.roi_mut(inner).assign(&self.roi(original));
        (padded, canvas)
    }
}
#[test]
fn test_roi_zero_padded() {
    // A 3x3 box padded by 2 on every side becomes 7x7 before clamping; the
    // clamp to the 10x10 image leaves a 6x6 box anchored at the origin
    // (padding pushes x1/y1 from 1 to -1, clamped to 0).
    let arr = ndarray::Array2::<u8>::ones((10, 10));
    let original = bounding_box::Aabb2::from_xywh(1.0, 1.0, 3.0, 3.0);
    let clamp = bounding_box::Aabb2::from_xywh(0.0, 0.0, 10.0, 10.0);
    let padded = original.padding(2.0).clamp(&clamp).unwrap();
    let padded_cast = bbox_cast_f64_to_usize(&padded);
    let original_cast = bbox_cast_f64_to_usize(&original);
    let (padded_result, padded_segment) = arr.roi_zero_padded(original_cast, padded_cast);
    assert_eq!(padded_result, bounding_box::Aabb2::from_xywh(0, 0, 6, 6));
    assert_eq!(padded_segment.shape(), &[6, 6]);
}
#[test]
pub fn bbox_clamp_failure_preload() {
    // Regression-style input (judging by the name) taken from a real
    // detection run with fractional coordinates near the image edge.
    // The test passes iff roi_zero_padded does not panic after the
    // scale -> clamp -> round -> cast pipeline.
    let segment_mask = ndarray::Array2::<u8>::zeros((512, 512));
    let og = bounding_box::Aabb2::from_xywh(475.0, 79.625, 37.0, 282.15);
    let clamp = bounding_box::Aabb2::from_xywh(0.0, 0.0, 512.0, 512.0);
    let padded = og
        .scale(nalgebra::Vector2::new(1.2, 1.2))
        .clamp(&clamp)
        .unwrap();
    let padded = bbox_round_f64(&padded);
    let og_cast = bbox_cast_f64_to_usize(&bbox_round_f64(&og));
    let padded_cast = bbox_cast_f64_to_usize(&padded);
    let (_bbox, _segment) = segment_mask.roi_zero_padded(og_cast, padded_cast);
}
#[test]
pub fn bbox_clamp_failure_preload_2() {
    // Second regression-style input (see bbox_clamp_failure_preload): a tall
    // box whose scaled version exceeds the bottom edge before clamping.
    // Passes iff roi_zero_padded does not panic.
    let segment_mask = ndarray::Array2::<u8>::zeros((512, 512));
    let bbox = bounding_box::Aabb2::from_xywh(354.25, 98.5, 116.25, 413.5);
    // let padded = bounding_box::Aabb2::from_xywh(342.625, 57.150000000000006, 139.5, 454.85);
    let clamp = bounding_box::Aabb2::from_xywh(0.0, 0.0, 512.0, 512.0);
    let padded = bbox
        .scale(nalgebra::Vector2::new(1.2, 1.2))
        .clamp(&clamp)
        .unwrap();
    let padded = bbox_round_f64(&padded);
    let bbox_cast = bbox_cast_f64_to_usize(&bbox_round_f64(&bbox));
    let padded_cast = bbox_cast_f64_to_usize(&padded);
    let (_bbox, _segment) = segment_mask.roi_zero_padded(bbox_cast, padded_cast);
}
#[test]
fn test_roi_zero_padded_3d() {
    // 3-D variant of test_roi_zero_padded: the padded segment keeps the
    // source's 3 channels while the spatial size clamps to 6x6.
    let arr = ndarray::Array3::<u8>::ones((10, 10, 3));
    let original = bounding_box::Aabb2::from_xywh(1.0, 1.0, 3.0, 3.0);
    let clamp = bounding_box::Aabb2::from_xywh(0.0, 0.0, 10.0, 10.0);
    let padded = original.padding(2.0).clamp(&clamp).unwrap();
    let padded_cast = bbox_cast_f64_to_usize(&padded);
    let original_cast = bbox_cast_f64_to_usize(&original);
    let (padded_result, padded_segment) = arr.roi_zero_padded(original_cast, padded_cast);
    assert_eq!(padded_result, bounding_box::Aabb2::from_xywh(0, 0, 6, 6));
    assert_eq!(padded_segment.shape(), &[6, 6, 3]);
}

1
rfcs

Submodule rfcs deleted from c973203daf

View File

@@ -1,5 +1,5 @@
[package]
name = "sqlite3-safetensor-cosine"
name = "sqlite3-ndarray-math"
version.workspace = true
edition.workspace = true
@@ -8,7 +8,7 @@ crate-type = ["cdylib", "staticlib"]
[dependencies]
ndarray = "0.16.1"
# ndarray-math = { git = "https://git.darksailor.dev/servius/ndarray-math", version = "0.1.0" }
ndarray-math = { path = "/Users/fs0c131y/Projects/ndarray-math", version = "0.1.0" }
ndarray-math = { git = "https://git.darksailor.dev/servius/ndarray-math", version = "0.1.0" }
# ndarray-math = { path = "/Users/fs0c131y/Projects/ndarray-math", version = "0.1.0" }
ndarray-safetensors = { version = "0.1.0", path = "../ndarray-safetensors" }
sqlite-loadable = "0.0.5"

View File

@@ -21,6 +21,8 @@ pub enum SubCommand {
Stats(Stats),
#[clap(name = "compare")]
Compare(Compare),
#[clap(name = "cluster")]
Cluster(Cluster),
#[clap(name = "gui")]
Gui,
#[clap(name = "completions")]
@@ -187,6 +189,22 @@ pub struct Compare {
pub image2: PathBuf,
}
#[derive(Debug, clap::Args)]
pub struct Cluster {
#[clap(short = 'd', long, default_value = "face_detections.db")]
pub database: PathBuf,
#[clap(short, long, default_value_t = 5)]
pub clusters: usize,
#[clap(short, long, default_value_t = 100)]
pub max_iterations: u64,
#[clap(short, long, default_value_t = 1e-4)]
pub tolerance: f64,
#[clap(long, default_value = "facenet")]
pub model_name: String,
#[clap(short, long)]
pub output: Option<PathBuf>,
}
impl Cli {
pub fn completions(shell: clap_complete::Shell) {
let mut command = <Cli as clap::CommandFactory>::command();

View File

@@ -2,9 +2,11 @@ mod cli;
use bounding_box::roi::MultiRoi;
use detector::*;
use detector::{database::FaceDatabase, facedet, facedet::FaceDetectionConfig, faceembed};
use errors::*;
use fast_image_resize::ResizeOptions;
use linfa::prelude::*;
use linfa_clustering::KMeans;
use ndarray::*;
use ndarray_image::*;
use ndarray_resize::NdFir;
@@ -22,10 +24,12 @@ const FACENET_MODEL_ONNX: &[u8] =
include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/models/facenet.onnx"));
pub fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter("info")
.with_thread_ids(true)
.with_thread_names(true)
.with_target(false)
.with_env_filter("info,ort=warn")
// .with_thread_ids(true)
// .with_thread_names(true)
.with_file(true)
.with_line_number(true)
.with_target(true)
.init();
let args = <cli::Cli as clap::Parser>::parse();
match args.cmd {
@@ -207,6 +211,9 @@ pub fn main() -> Result<()> {
std::process::exit(1);
}
}
cli::SubCommand::Cluster(cluster) => {
run_clustering(cluster)?;
}
cli::SubCommand::Completions { shell } => {
cli::Cli::completions(shell);
}
@@ -934,3 +941,123 @@ fn run_stats(stats: cli::Stats) -> Result<()> {
Ok(())
}
fn run_clustering(cluster: cli::Cluster) -> Result<()> {
let db = FaceDatabase::new(&cluster.database).change_context(Error)?;
// Get all embeddings for the specified model
let embeddings = db
.get_all_embeddings(Some(&cluster.model_name))
.change_context(Error)?;
if embeddings.is_empty() {
println!("No embeddings found for model '{}'", cluster.model_name);
return Ok(());
}
println!(
"Found {} embeddings for model '{}'",
embeddings.len(),
cluster.model_name
);
if embeddings.len() < cluster.clusters {
println!(
"Warning: Number of embeddings ({}) is less than requested clusters ({})",
embeddings.len(),
cluster.clusters
);
return Ok(());
}
// Convert embeddings to a 2D array for clustering
let embedding_dim = embeddings[0].embedding.len();
let mut data = Array2::<f64>::zeros((embeddings.len(), embedding_dim));
for (i, embedding_record) in embeddings.iter().enumerate() {
for (j, &val) in embedding_record.embedding.iter().enumerate() {
data[[i, j]] = val as f64;
}
}
println!(
"Running K-means clustering with {} clusters...",
cluster.clusters
);
// Create dataset
let dataset = linfa::Dataset::new(data, Array1::<usize>::zeros(embeddings.len()));
// Configure and run K-means
let model = KMeans::params(cluster.clusters)
.max_n_iterations(cluster.max_iterations)
.tolerance(cluster.tolerance)
.fit(&dataset)
.change_context(Error)
.attach_printable("Failed to run K-means clustering")?;
// Get cluster assignments
let predictions = model.predict(&dataset);
// Group results by cluster
let mut clusters: std::collections::HashMap<usize, Vec<(i64, String)>> =
std::collections::HashMap::new();
for (i, &cluster_id) in predictions.iter().enumerate() {
let face_id = embeddings[i].face_id;
// Get image path for this face
let image_info = db.get_image_for_face(face_id).change_context(Error)?;
let image_path = image_info
.map(|info| info.file_path)
.unwrap_or_else(|| "Unknown".to_string());
clusters
.entry(cluster_id)
.or_insert_with(Vec::new)
.push((face_id, image_path));
}
// Print results
println!("\nClustering Results:");
for cluster_id in 0..cluster.clusters {
if let Some(faces) = clusters.get(&cluster_id) {
println!("\nCluster {}: {} faces", cluster_id, faces.len());
for (face_id, image_path) in faces {
println!(" Face ID: {}, Image: {}", face_id, image_path);
}
} else {
println!("\nCluster {}: 0 faces", cluster_id);
}
}
// Optionally save results to file
if let Some(output_path) = &cluster.output {
use std::io::Write;
let mut file = std::fs::File::create(output_path)
.change_context(Error)
.attach_printable("Failed to create output file")?;
writeln!(file, "K-means Clustering Results").change_context(Error)?;
writeln!(file, "Model: {}", cluster.model_name).change_context(Error)?;
writeln!(file, "Total embeddings: {}", embeddings.len()).change_context(Error)?;
writeln!(file, "Number of clusters: {}", cluster.clusters).change_context(Error)?;
writeln!(file, "").change_context(Error)?;
for cluster_id in 0..cluster.clusters {
if let Some(faces) = clusters.get(&cluster_id) {
writeln!(file, "Cluster {}: {} faces", cluster_id, faces.len())
.change_context(Error)?;
for (face_id, image_path) in faces {
writeln!(file, " Face ID: {}, Image: {}", face_id, image_path)
.change_context(Error)?;
}
writeln!(file, "").change_context(Error)?;
}
}
println!("\nResults saved to: {:?}", output_path);
}
Ok(())
}

View File

@@ -1,10 +1,12 @@
fn main() -> Result<(), Box<dyn std::error::Error>> {
use detector::errors::*;
fn main() -> Result<()> {
// Initialize logging
tracing_subscriber::fmt()
.with_env_filter("info")
.with_thread_ids(true)
.with_thread_names(true)
.with_target(false)
.with_env_filter("warn,ort=warn")
.with_file(true)
.with_line_number(true)
// .with_thread_names(true)
.with_target(true)
.init();
// Run the GUI

View File

@@ -65,14 +65,15 @@ impl FaceDatabase {
/// Create a new database connection and initialize tables
pub fn new<P: AsRef<Path>>(db_path: P) -> Result<Self> {
let conn = Connection::open(db_path).change_context(Error)?;
unsafe {
let _guard = rusqlite::LoadExtensionGuard::new(&conn).change_context(Error)?;
conn.load_extension(
"/Users/fs0c131y/.cache/cargo/target/release/libsqlite3_safetensor_cosine.dylib",
None::<&str>,
)
.change_context(Error)?;
}
// Temporarily disable extension loading for clustering demo
// unsafe {
// let _guard = rusqlite::LoadExtensionGuard::new(&conn).change_context(Error)?;
// conn.load_extension(
// "/Users/fs0c131y/.cache/cargo/target/release/libsqlite3_safetensor_cosine.dylib",
// None::<&str>,
// )
// .change_context(Error)?;
// }
let db = Self { conn };
db.create_tables()?;
Ok(db)
@@ -594,6 +595,76 @@ impl FaceDatabase {
println!("{:?}", result);
}
}
/// Get all embeddings for a specific model
pub fn get_all_embeddings(&self, model_name: Option<&str>) -> Result<Vec<EmbeddingRecord>> {
let mut embeddings = Vec::new();
if let Some(model) = model_name {
let mut stmt = self.conn.prepare(
"SELECT id, face_id, embedding, model_name, created_at FROM embeddings WHERE model_name = ?1"
).change_context(Error)?;
let embedding_iter = stmt
.query_map(params![model], |row| {
let embedding_bytes: Vec<u8> = row.get(2)?;
let embedding: ndarray::Array1<f32> = {
let sf = ndarray_safetensors::SafeArraysView::from_bytes(&embedding_bytes)
.change_context(Error)
.unwrap();
sf.tensor::<f32, ndarray::Ix1>("embedding")
.unwrap()
.to_owned()
};
Ok(EmbeddingRecord {
id: row.get(0)?,
face_id: row.get(1)?,
embedding,
model_name: row.get(3)?,
created_at: row.get(4)?,
})
})
.change_context(Error)?;
for embedding in embedding_iter {
embeddings.push(embedding.change_context(Error)?);
}
} else {
let mut stmt = self
.conn
.prepare("SELECT id, face_id, embedding, model_name, created_at FROM embeddings")
.change_context(Error)?;
let embedding_iter = stmt
.query_map([], |row| {
let embedding_bytes: Vec<u8> = row.get(2)?;
let embedding: ndarray::Array1<f32> = {
let sf = ndarray_safetensors::SafeArraysView::from_bytes(&embedding_bytes)
.change_context(Error)
.unwrap();
sf.tensor::<f32, ndarray::Ix1>("embedding")
.unwrap()
.to_owned()
};
Ok(EmbeddingRecord {
id: row.get(0)?,
face_id: row.get(1)?,
embedding,
model_name: row.get(3)?,
created_at: row.get(4)?,
})
})
.change_context(Error)?;
for embedding in embedding_iter {
embeddings.push(embedding.change_context(Error)?);
}
}
Ok(embeddings)
}
}
fn add_sqlite_cosine_similarity(db: &Connection) -> Result<()> {

View File

@@ -174,7 +174,6 @@ impl FaceDetectionModelOutput {
// let mut decoded_landmarks = Vec::new();
// let mut confidences = Vec::new();
dbg!(priors.shape());
let (decoded_boxes, decoded_landmarks, confidences) = (0..priors.shape()[0])
.filter(|&i| scores[i] > config.threshold)
.map(|i| {

View File

@@ -1,5 +1,5 @@
use iced::{
Alignment, Element, Length, Task, Theme,
Alignment, Element, Length, Settings, Task, Theme,
widget::{
Space, button, column, container, image, pick_list, progress_bar, row, scrollable, slider,
text,
@@ -10,6 +10,7 @@ use std::path::PathBuf;
use std::sync::Arc;
use crate::gui::bridge::FaceDetectionBridge;
use ::image::{DynamicImage, ImageFormat, RgbImage};
#[derive(Debug, Clone)]
pub enum Message {
@@ -43,6 +44,7 @@ pub enum Message {
ImageLoaded(Option<Arc<Vec<u8>>>),
SecondImageLoaded(Option<Arc<Vec<u8>>>),
ProcessedImageUpdated(Option<Vec<u8>>),
FaceRoisLoaded(Vec<image::Handle>, Vec<image::Handle>),
}
#[derive(Debug, Clone, PartialEq)]
@@ -55,9 +57,13 @@ pub enum Tab {
#[derive(Debug, Clone, PartialEq)]
pub enum ExecutorType {
MnnCpu,
#[cfg(feature = "mnn-metal")]
MnnMetal,
#[cfg(feature = "mnn-coreml")]
MnnCoreML,
OnnxCpu,
#[cfg(feature = "ort-cuda")]
OrtCuda,
}
#[derive(Debug, Clone)]
@@ -76,6 +82,8 @@ pub enum ComparisonResult {
Success {
image1_faces: usize,
image2_faces: usize,
image1_face_rois: Vec<ndarray::Array3<u8>>,
image2_face_rois: Vec<ndarray::Array3<u8>>,
best_similarity: f32,
processing_time: f64,
},
@@ -110,6 +118,10 @@ pub struct FaceDetectorApp {
current_image_handle: Option<image::Handle>,
processed_image_handle: Option<image::Handle>,
second_image_handle: Option<image::Handle>,
// Face ROI handles for comparison display
image1_face_roi_handles: Vec<image::Handle>,
image2_face_roi_handles: Vec<image::Handle>,
}
impl Default for FaceDetectorApp {
@@ -121,7 +133,10 @@ impl Default for FaceDetectorApp {
output_path: None,
threshold: 0.8,
nms_threshold: 0.3,
#[cfg(not(any(feature = "mnn-metal", feature = "ort-cuda")))]
executor_type: ExecutorType::MnnCpu,
#[cfg(feature = "ort-cuda")]
executor_type: ExecutorType::OrtCuda,
is_processing: false,
progress: 0.0,
status_message: "Ready".to_string(),
@@ -130,6 +145,8 @@ impl Default for FaceDetectorApp {
current_image_handle: None,
processed_image_handle: None,
second_image_handle: None,
image1_face_roi_handles: Vec::new(),
image2_face_roi_handles: Vec::new(),
}
}
}
@@ -313,6 +330,8 @@ impl FaceDetectorApp {
self.detection_result = None;
self.comparison_result = None;
self.processed_image_handle = None;
self.image1_face_roi_handles.clear();
self.image2_face_roi_handles.clear();
self.status_message = "Results cleared".to_string();
Task::none()
}
@@ -356,6 +375,8 @@ impl FaceDetectorApp {
ComparisonResult::Success {
best_similarity,
processing_time,
image1_face_rois,
image2_face_rois,
..
} => {
let interpretation = if *best_similarity > 0.8 {
@@ -372,6 +393,16 @@ impl FaceDetectorApp {
"Comparison complete! Similarity: {:.3} - {} (Processing time: {:.2}s)",
best_similarity, interpretation, processing_time
);
// Convert face ROIs to image handles
let image1_handles = convert_face_rois_to_handles(image1_face_rois.clone());
let image2_handles = convert_face_rois_to_handles(image2_face_rois.clone());
self.comparison_result = Some(result);
return Task::perform(
async move { (image1_handles, image2_handles) },
|(h1, h2)| Message::FaceRoisLoaded(h1, h2),
);
}
ComparisonResult::Error(error) => {
self.status_message = format!("Comparison failed: {}", error);
@@ -382,6 +413,12 @@ impl FaceDetectorApp {
Task::none()
}
Message::FaceRoisLoaded(image1_handles, image2_handles) => {
self.image1_face_roi_handles = image1_handles;
self.image2_face_roi_handles = image2_handles;
Task::none()
}
Message::ProgressUpdate(progress) => {
self.progress = progress;
Task::none()
@@ -765,6 +802,8 @@ impl FaceDetectorApp {
ComparisonResult::Success {
image1_faces,
image2_faces,
image1_face_rois: _,
image2_face_rois: _,
best_similarity,
processing_time,
} => {
@@ -790,7 +829,7 @@ impl FaceDetectorApp {
)
};
column![
let mut result_column = column![
text("Comparison Results").size(18),
text(format!("First image faces: {}", image1_faces)),
text(format!("Second image faces: {}", image2_faces)),
@@ -800,7 +839,89 @@ impl FaceDetectorApp {
}),
text(format!("Processing time: {:.2}s", processing_time)),
]
.spacing(5);
// Add face ROI displays if available
if !self.image1_face_roi_handles.is_empty()
|| !self.image2_face_roi_handles.is_empty()
{
result_column = result_column.push(text("Detected Faces").size(16));
// Create face ROI rows
let image1_faces_row = if !self.image1_face_roi_handles.is_empty() {
let faces: Element<'_, Message> = self
.image1_face_roi_handles
.iter()
.enumerate()
.fold(row![].spacing(5), |row, (i, handle)| {
row.push(
column![
text(format!("Face {}", i + 1)).size(12),
container(
image(handle.clone())
.width(80)
.height(80)
.content_fit(iced::ContentFit::Cover)
)
.style(container::bordered_box)
.padding(2),
]
.spacing(2)
.align_x(Alignment::Center),
)
})
.into();
column![
text("First Image Faces:").size(14),
scrollable(faces).direction(scrollable::Direction::Horizontal(
scrollable::Scrollbar::new()
)),
]
.spacing(5)
} else {
column![text("First Image Faces: None detected").size(14)]
};
let image2_faces_row = if !self.image2_face_roi_handles.is_empty() {
let faces: Element<'_, Message> = self
.image2_face_roi_handles
.iter()
.enumerate()
.fold(row![].spacing(5), |row, (i, handle)| {
row.push(
column![
text(format!("Face {}", i + 1)).size(12),
container(
image(handle.clone())
.width(80)
.height(80)
.content_fit(iced::ContentFit::Cover)
)
.style(container::bordered_box)
.padding(2),
]
.spacing(2)
.align_x(Alignment::Center),
)
})
.into();
column![
text("Second Image Faces:").size(14),
scrollable(faces).direction(scrollable::Direction::Horizontal(
scrollable::Scrollbar::new()
)),
]
.spacing(5)
} else {
column![text("Second Image Faces: None detected").size(14)]
};
result_column = result_column.push(image1_faces_row).push(image2_faces_row);
}
result_column
}
ComparisonResult::Error(error) => column![
text("Comparison Results").size(18),
@@ -816,19 +937,26 @@ impl FaceDetectorApp {
]
};
scrollable(
column![file_section, comparison_image_section, controls, results]
.spacing(20)
.padding(20)
.padding(20),
)
.into()
}
fn settings_view(&self) -> Element<'_, Message> {
let executor_options = vec![
ExecutorType::MnnCpu,
ExecutorType::MnnMetal,
ExecutorType::MnnCoreML,
ExecutorType::OnnxCpu,
];
#[allow(unused_mut)]
let mut executor_options = vec![ExecutorType::MnnCpu, ExecutorType::OnnxCpu];
#[cfg(feature = "mnn-metal")]
executor_options.push(ExecutorType::MnnMetal);
#[cfg(feature = "mnn-coreml")]
executor_options.push(ExecutorType::MnnCoreML);
#[cfg(feature = "ort-cuda")]
executor_options.push(ExecutorType::OrtCuda);
container(
column![
@@ -874,18 +1002,52 @@ impl std::fmt::Display for ExecutorType {
/// Human-readable executor names shown in the GUI's executor picker.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    match self {
        ExecutorType::MnnCpu => write!(f, "MNN (CPU)"),
        // Feature-gated variants must be gated in the match too, or the arm
        // would reference a variant that doesn't exist in this build.
        #[cfg(feature = "mnn-metal")]
        ExecutorType::MnnMetal => write!(f, "MNN (Metal)"),
        #[cfg(feature = "mnn-coreml")]
        ExecutorType::MnnCoreML => write!(f, "MNN (CoreML)"),
        ExecutorType::OnnxCpu => write!(f, "ONNX (CPU)"),
        #[cfg(feature = "ort-cuda")]
        ExecutorType::OrtCuda => write!(f, "ONNX (CUDA)"),
    }
}
}
// Helper function to convert face ROIs to image handles.
//
// Each ROI is treated as an HWC u8 array and re-encoded as PNG for iced.
// ROIs whose raw buffer length doesn't match width*height*3 (RgbImage::from_raw
// returns None) or that fail PNG encoding are silently dropped by filter_map.
fn convert_face_rois_to_handles(face_rois: Vec<ndarray::Array3<u8>>) -> Vec<image::Handle> {
    face_rois
        .into_iter()
        .filter_map(|roi| {
            // Convert ndarray to image::RgbImage
            let (height, width, _) = roi.dim();
            // NOTE(review): into_raw_vec_and_offset yields the backing buffer
            // in memory order — this assumes `roi` is a standard-layout
            // (row-major, contiguous) array; confirm upstream crops are owned
            // and contiguous, otherwise pixels would be scrambled.
            let (raw_data, _offset) = roi.into_raw_vec_and_offset();
            if let Some(img) = RgbImage::from_raw(width as u32, height as u32, raw_data) {
                // Convert to PNG bytes
                let mut buffer = Vec::new();
                let mut cursor = std::io::Cursor::new(&mut buffer);
                if DynamicImage::ImageRgb8(img)
                    .write_to(&mut cursor, ImageFormat::Png)
                    .is_ok()
                {
                    return Some(image::Handle::from_bytes(buffer));
                }
            }
            None
        })
        .collect()
}
/// Launch the face-detector GUI with antialiasing enabled.
pub fn run() -> iced::Result {
    let settings = Settings {
        antialiasing: true,
        ..Default::default()
    };
    iced::application(
        "Face Detector",
        FaceDetectorApp::update,
        FaceDetectorApp::view,
    )
    .settings(settings)
    // run_with lets the app supply its initial state plus a startup Task.
    .run_with(FaceDetectorApp::new)
}

View File

@@ -1,9 +1,17 @@
use std::path::PathBuf;
use crate::errors;
use crate::facedet::{FaceDetectionConfig, FaceDetector, retinaface};
use crate::faceembed::facenet;
use crate::gui::app::{ComparisonResult, DetectionResult, ExecutorType};
use bounding_box::Aabb2;
use bounding_box::roi::MultiRoi as _;
use error_stack::ResultExt;
use fast_image_resize::ResizeOptions;
use ndarray::{Array1, Array2, Array3, Array4};
use ndarray_image::ImageToNdarray;
use ndarray_math::CosineSimilarity;
use ndarray_resize::NdFir;
const RETINAFACE_MODEL_MNN: &[u8] = include_bytes!("../../models/retinaface.mnn");
const FACENET_MODEL_MNN: &[u8] = include_bytes!("../../models/facenet.mnn");
@@ -62,11 +70,19 @@ impl FaceDetectionBridge {
)
.await
{
Ok((image1_faces, image2_faces, best_similarity)) => {
Ok((
image1_faces,
image2_faces,
image1_face_rois,
image2_face_rois,
best_similarity,
)) => {
let processing_time = start_time.elapsed().as_secs_f64();
ComparisonResult::Success {
image1_faces,
image2_faces,
image1_face_rois,
image2_face_rois,
best_similarity,
processing_time,
}
@@ -98,17 +114,34 @@ impl FaceDetectionBridge {
// Create detector and detect faces
let faces = match executor_type {
ExecutorType::MnnCpu | ExecutorType::MnnMetal | ExecutorType::MnnCoreML => {
let forward_type = match executor_type {
ExecutorType::MnnCpu => mnn::ForwardType::CPU,
ExecutorType::MnnMetal => mnn::ForwardType::Metal,
ExecutorType::MnnCoreML => mnn::ForwardType::CoreML,
_ => unreachable!(),
};
ExecutorType::MnnCpu => {
let mut detector = retinaface::mnn::FaceDetection::builder(RETINAFACE_MODEL_MNN)
.map_err(|e| format!("Failed to create MNN detector: {}", e))?
.with_forward_type(forward_type)
.with_forward_type(mnn::ForwardType::CPU)
.build()
.map_err(|e| format!("Failed to build MNN detector: {}", e))?;
detector
.detect_faces(image_array.view(), &config)
.map_err(|e| format!("Detection failed: {}", e))?
}
#[cfg(feature = "mnn-metal")]
ExecutorType::MnnMetal => {
let mut detector = retinaface::mnn::FaceDetection::builder(RETINAFACE_MODEL_MNN)
.map_err(|e| format!("Failed to create MNN detector: {}", e))?
.with_forward_type(mnn::ForwardType::Metal)
.build()
.map_err(|e| format!("Failed to build MNN detector: {}", e))?;
detector
.detect_faces(image_array.view(), &config)
.map_err(|e| format!("Detection failed: {}", e))?
}
#[cfg(feature = "mnn-coreml")]
ExecutorType::MnnCoreML => {
let mut detector = retinaface::mnn::FaceDetection::builder(RETINAFACE_MODEL_MNN)
.map_err(|e| format!("Failed to create MNN detector: {}", e))?
.with_forward_type(mnn::ForwardType::CoreML)
.build()
.map_err(|e| format!("Failed to build MNN detector: {}", e))?;
@@ -126,6 +159,21 @@ impl FaceDetectionBridge {
.detect_faces(image_array.view(), &config)
.map_err(|e| format!("Detection failed: {}", e))?
}
#[cfg(feature = "ort-cuda")]
ExecutorType::OrtCuda => {
use crate::ort_ep::ExecutionProvider;
let ep = ExecutionProvider::CUDA;
let mut detector = retinaface::ort::FaceDetection::builder(RETINAFACE_MODEL_ONNX)
.map_err(|e| format!("Failed to create ONNX CUDA detector: {}", e))?
.with_execution_providers([ep])
.build()
.map_err(|e| format!("Failed to build ONNX CUDA detector: {}", e))?;
detector
.detect_faces(image_array.view(), &config)
.map_err(|e| format!("CUDA detection failed: {}", e))?
}
};
let faces_count = faces.bbox.len();
@@ -172,196 +220,350 @@ impl FaceDetectionBridge {
threshold: f32,
nms_threshold: f32,
executor_type: ExecutorType,
) -> Result<(usize, usize, f32), Box<dyn std::error::Error + Send + Sync>> {
// Load both images
let img1 = image::open(&image1_path)?.to_rgb8();
let img2 = image::open(&image2_path)?.to_rgb8();
// Convert to ndarray format
let image1_array = img1.as_ndarray()?;
let image2_array = img2.as_ndarray()?;
// Create detection configuration
let config1 = FaceDetectionConfig::default()
.with_threshold(threshold)
.with_nms_threshold(nms_threshold)
.with_input_width(1024)
.with_input_height(1024);
let config2 = FaceDetectionConfig::default()
.with_threshold(threshold)
.with_nms_threshold(nms_threshold)
.with_input_width(1024)
.with_input_height(1024);
) -> Result<
(usize, usize, Vec<Array3<u8>>, Vec<Array3<u8>>, f32),
Box<dyn std::error::Error + Send + Sync>,
> {
// Create detector and embedder, detect faces and generate embeddings
let (faces1, faces2, best_similarity) = match executor_type {
ExecutorType::MnnCpu | ExecutorType::MnnMetal | ExecutorType::MnnCoreML => {
let forward_type = match executor_type {
ExecutorType::MnnCpu => mnn::ForwardType::CPU,
ExecutorType::MnnMetal => mnn::ForwardType::Metal,
ExecutorType::MnnCoreML => mnn::ForwardType::CoreML,
_ => unreachable!(),
};
let mut detector = retinaface::mnn::FaceDetection::builder(RETINAFACE_MODEL_MNN)
let (image1_faces, image2_faces, image1_rois, image2_rois, best_similarity) =
match executor_type {
ExecutorType::MnnCpu => {
let mut detector =
retinaface::mnn::FaceDetection::builder(RETINAFACE_MODEL_MNN)
.map_err(|e| format!("Failed to create MNN detector: {}", e))?
.with_forward_type(forward_type.clone())
.with_forward_type(mnn::ForwardType::CPU)
.build()
.map_err(|e| format!("Failed to build MNN detector: {}", e))?;
let embedder = facenet::mnn::EmbeddingGenerator::builder(FACENET_MODEL_MNN)
let mut embedder = facenet::mnn::EmbeddingGenerator::builder(FACENET_MODEL_MNN)
.map_err(|e| format!("Failed to create MNN embedder: {}", e))?
.with_forward_type(forward_type)
.with_forward_type(mnn::ForwardType::CPU)
.build()
.map_err(|e| format!("Failed to build MNN embedder: {}", e))?;
// Detect faces in both images
let faces1 = detector
.detect_faces(image1_array.view(), &config1)
.map_err(|e| format!("Detection failed for image 1: {}", e))?;
let faces2 = detector
.detect_faces(image2_array.view(), &config2)
.map_err(|e| format!("Detection failed for image 2: {}", e))?;
// Extract face crops and generate embeddings
let mut best_similarity = 0.0f32;
for bbox1 in &faces1.bbox {
let crop1 = Self::crop_face_from_image(&img1, bbox1)?;
let crop1_array = ndarray::Array::from_shape_vec(
(1, crop1.height() as usize, crop1.width() as usize, 3),
crop1
.pixels()
.flat_map(|p| [p.0[0], p.0[1], p.0[2]])
.collect(),
let img_1 = run_detection(
image1_path,
&mut detector,
&mut embedder,
threshold,
nms_threshold,
2,
)?;
let img_2 = run_detection(
image2_path,
&mut detector,
&mut embedder,
threshold,
nms_threshold,
2,
)?;
let embedding1 = embedder
.run_models(crop1_array.view())
.map_err(|e| format!("Embedding generation failed: {}", e))?;
let image1_rois = img_1.rois;
let image2_rois = img_2.rois;
let image1_bbox_len = img_1.bbox.len();
let image2_bbox_len = img_2.bbox.len();
let best_similarity = compare_faces(&img_1.embeddings, &img_2.embeddings)?;
for bbox2 in &faces2.bbox {
let crop2 = Self::crop_face_from_image(&img2, bbox2)?;
let crop2_array = ndarray::Array::from_shape_vec(
(1, crop2.height() as usize, crop2.width() as usize, 3),
crop2
.pixels()
.flat_map(|p| [p.0[0], p.0[1], p.0[2]])
.collect(),
)?;
let embedding2 = embedder
.run_models(crop2_array.view())
.map_err(|e| format!("Embedding generation failed: {}", e))?;
let similarity = Self::cosine_similarity(
embedding1.row(0).as_slice().unwrap(),
embedding2.row(0).as_slice().unwrap(),
);
best_similarity = best_similarity.max(similarity);
(
image1_bbox_len,
image2_bbox_len,
image1_rois,
image2_rois,
best_similarity,
)
}
}
(faces1, faces2, best_similarity)
}
ExecutorType::OnnxCpu => {
let mut detector = retinaface::ort::FaceDetection::builder(RETINAFACE_MODEL_ONNX)
.map_err(|e| format!("Failed to create ONNX detector: {}", e))?
#[cfg(feature = "mnn-metal")]
ExecutorType::MnnMetal => {
let mut detector =
retinaface::mnn::FaceDetection::builder(RETINAFACE_MODEL_MNN)
.map_err(|e| format!("Failed to create MNN detector: {}", e))?
.with_forward_type(mnn::ForwardType::Metal)
.build()
.map_err(|e| format!("Failed to build ONNX detector: {}", e))?;
.map_err(|e| format!("Failed to build MNN detector: {}", e))?;
let mut embedder = facenet::ort::EmbeddingGenerator::builder(FACENET_MODEL_ONNX)
.map_err(|e| format!("Failed to create ONNX embedder: {}", e))?
let mut embedder = facenet::mnn::EmbeddingGenerator::builder(FACENET_MODEL_MNN)
.map_err(|e| format!("Failed to create MNN embedder: {}", e))?
.with_forward_type(mnn::ForwardType::Metal)
.build()
.map_err(|e| format!("Failed to build ONNX embedder: {}", e))?;
.map_err(|e| format!("Failed to build MNN embedder: {}", e))?;
// Detect faces in both images
let faces1 = detector
.detect_faces(image1_array.view(), &config1)
.map_err(|e| format!("Detection failed for image 1: {}", e))?;
let faces2 = detector
.detect_faces(image2_array.view(), &config2)
.map_err(|e| format!("Detection failed for image 2: {}", e))?;
// Extract face crops and generate embeddings
let mut best_similarity = 0.0f32;
for bbox1 in &faces1.bbox {
let crop1 = Self::crop_face_from_image(&img1, bbox1)?;
let crop1_array = ndarray::Array::from_shape_vec(
(1, crop1.height() as usize, crop1.width() as usize, 3),
crop1
.pixels()
.flat_map(|p| [p.0[0], p.0[1], p.0[2]])
.collect(),
let img_1 = run_detection(
image1_path,
&mut detector,
&mut embedder,
threshold,
nms_threshold,
2,
)?;
let img_2 = run_detection(
image2_path,
&mut detector,
&mut embedder,
threshold,
nms_threshold,
2,
)?;
let embedding1 = embedder
.run_models(crop1_array.view())
.map_err(|e| format!("Embedding generation failed: {}", e))?;
let image1_rois = img_1.rois;
let image2_rois = img_2.rois;
let image1_bbox_len = img_1.bbox.len();
let image2_bbox_len = img_2.bbox.len();
let best_similarity = compare_faces(&img_1.embeddings, &img_2.embeddings)?;
for bbox2 in &faces2.bbox {
let crop2 = Self::crop_face_from_image(&img2, bbox2)?;
let crop2_array = ndarray::Array::from_shape_vec(
(1, crop2.height() as usize, crop2.width() as usize, 3),
crop2
.pixels()
.flat_map(|p| [p.0[0], p.0[1], p.0[2]])
.collect(),
(
image1_bbox_len,
image2_bbox_len,
image1_rois,
image2_rois,
best_similarity,
)
}
#[cfg(feature = "mnn-coreml")]
ExecutorType::MnnCoreML => {
let mut detector =
retinaface::mnn::FaceDetection::builder(RETINAFACE_MODEL_MNN)
.map_err(|e| format!("Failed to create MNN detector: {}", e))?
.with_forward_type(mnn::ForwardType::CoreML)
.build()
.map_err(|e| format!("Failed to build MNN detector: {}", e))?;
let mut embedder = facenet::mnn::EmbeddingGenerator::builder(FACENET_MODEL_MNN)
.map_err(|e| format!("Failed to create MNN embedder: {}", e))?
.with_forward_type(mnn::ForwardType::CoreML)
.build()
.map_err(|e| format!("Failed to build MNN embedder: {}", e))?;
let img_1 = run_detection(
image1_path,
&mut detector,
&mut embedder,
threshold,
nms_threshold,
2,
)?;
let img_2 = run_detection(
image2_path,
&mut detector,
&mut embedder,
threshold,
nms_threshold,
2,
)?;
let embedding2 = embedder
.run_models(crop2_array.view())
.map_err(|e| format!("Embedding generation failed: {}", e))?;
let image1_rois = img_1.rois;
let image2_rois = img_2.rois;
let image1_bbox_len = img_1.bbox.len();
let image2_bbox_len = img_2.bbox.len();
let best_similarity = compare_faces(&img_1.embeddings, &img_2.embeddings)?;
let similarity = Self::cosine_similarity(
embedding1.row(0).as_slice().unwrap(),
embedding2.row(0).as_slice().unwrap(),
);
best_similarity = best_similarity.max(similarity);
}
(
image1_bbox_len,
image2_bbox_len,
image1_rois,
image2_rois,
best_similarity,
)
}
ExecutorType::OnnxCpu => unimplemented!("ONNX face comparison not yet implemented"),
#[cfg(feature = "ort-cuda")]
ExecutorType::OrtCuda => {
use crate::ort_ep::ExecutionProvider;
let ep = ExecutionProvider::CUDA;
let mut detector =
retinaface::ort::FaceDetection::builder(RETINAFACE_MODEL_ONNX)
.map_err(|e| format!("Failed to create MNN detector: {}", e))?
.with_execution_providers([ep])
.build()
.map_err(|e| format!("Failed to build MNN detector: {}", e))?;
(faces1, faces2, best_similarity)
let mut embedder =
facenet::ort::EmbeddingGenerator::builder(FACENET_MODEL_ONNX)
.map_err(|e| format!("Failed to create MNN embedder: {}", e))?
.with_execution_providers([ep])
.build()
.map_err(|e| format!("Failed to build MNN embedder: {}", e))?;
let img_1 = run_detection(
image1_path,
&mut detector,
&mut embedder,
threshold,
nms_threshold,
2,
)?;
let img_2 = run_detection(
image2_path,
&mut detector,
&mut embedder,
threshold,
nms_threshold,
2,
)?;
let image1_rois = img_1.rois;
let image2_rois = img_2.rois;
let image1_bbox_len = img_1.bbox.len();
let image2_bbox_len = img_2.bbox.len();
let best_similarity = compare_faces(&img_1.embeddings, &img_2.embeddings)?;
(
image1_bbox_len,
image2_bbox_len,
image1_rois,
image2_rois,
best_similarity,
)
}
};
Ok((faces1.bbox.len(), faces2.bbox.len(), best_similarity))
}
/// Crop a face region out of `img`, clamping the box so the crop never
/// exceeds the image bounds.
///
/// The bounding box is given in pixel coordinates; origin and extent are
/// clamped independently, so a box hanging off the right/bottom edge is
/// shrunk rather than rejected.
fn crop_face_from_image(
    img: &image::RgbImage,
    bbox: &bounding_box::Aabb2<usize>,
) -> Result<image::RgbImage, Box<dyn std::error::Error + Send + Sync>> {
    let (img_w, img_h) = (img.width(), img.height());
    let origin = bbox.min_vertex();
    let extent = bbox.size();
    // Clamp the origin to the last valid pixel, then shrink the extent to
    // whatever room remains to the image edge.
    let x0 = (origin.x as u32).min(img_w.saturating_sub(1));
    let y0 = (origin.y as u32).min(img_h.saturating_sub(1));
    let w = (extent.x as u32).min(img_w - x0);
    let h = (extent.y as u32).min(img_h - y0);
    Ok(image::imageops::crop_imm(img, x0, y0, w, h).to_image())
}
/// Cosine similarity between two embedding vectors.
///
/// Returns `dot(a, b) / (|a| * |b|)`; yields 0.0 when either vector has
/// zero magnitude to avoid dividing by zero.
/// NOTE(review): assumes `a` and `b` have equal length — `zip` silently
/// truncates to the shorter slice otherwise; confirm callers guarantee this.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    // Guard against zero-magnitude vectors (similarity is undefined there).
    if norm_a == 0.0 || norm_b == 0.0 {
        0.0
    } else {
        dot_product / (norm_a * norm_b)
    }
Ok((
image1_faces,
image2_faces,
image1_rois,
image2_rois,
best_similarity,
))
}
}
use crate::errors::Error;
pub fn compare_faces(
faces_1: &[Array1<f32>],
faces_2: &[Array1<f32>],
) -> Result<f32, error_stack::Report<crate::errors::Error>> {
use error_stack::Report;
if faces_1.is_empty() || faces_2.is_empty() {
Err(Report::new(crate::errors::Error))
.attach_printable("One or both images have no detected faces")?;
}
if faces_1.len() != faces_2.len() {
Err(Report::new(crate::errors::Error))
.attach_printable("Face count mismatch between images")?;
}
Ok(faces_1
.iter()
.zip(faces_2)
.flat_map(|(face_1, face_2)| face_1.cosine_similarity(face_2))
.inspect(|v| tracing::info!("Cosine similarity: {}", v))
.map(|v| ordered_float::OrderedFloat(v))
.max()
.map(|v| v.0)
.ok_or(Report::new(Error))?)
}
/// Per-image result of `run_detection`: detected boxes, their cropped
/// pixel regions, and one embedding per face. All three vectors are
/// index-aligned (element `i` of each refers to the same face).
#[derive(Debug)]
pub struct DetectionOutput {
    // Padded (scaled by 1.30) bounding boxes in pixel coordinates.
    bbox: Vec<Aabb2<usize>>,
    // 224x224 RGB face crops, same order as `bbox`.
    rois: Vec<ndarray::Array3<u8>>,
    // One embedding vector per detected face, same order as `bbox`.
    embeddings: Vec<Array1<f32>>,
}
/// Detect faces in the image at `image`, draw their padded boxes into the
/// working array, crop each face, resize it to the embedder's 224x224
/// input, and embed the crops in batches of `chunk_size`.
///
/// The final partial batch is zero-padded up to `chunk_size` so the
/// embedder always sees a fixed batch; padding rows are dropped from the
/// returned embeddings.
///
/// # Errors
/// Fails when the image cannot be opened or converted, detection fails,
/// ROI extraction/resizing fails, or the embedder rejects a batch.
fn run_detection<D, E>(
    image: impl AsRef<std::path::Path>,
    retinaface: &mut D,
    facenet: &mut E,
    threshold: f32,
    nms_threshold: f32,
    chunk_size: usize,
) -> crate::errors::Result<DetectionOutput>
where
    D: crate::facedet::FaceDetector,
    E: crate::faceembed::FaceEmbedder,
{
    use errors::*;
    use itertools::Itertools;
    let image = image.as_ref();
    let image = image::open(image)
        .change_context(Error)
        .attach_printable(image.to_string_lossy().to_string())?;
    let image = image.into_rgb8();
    let mut array = image
        .into_ndarray()
        .change_context(errors::Error)
        .attach_printable("Failed to convert image to ndarray")?;
    let output = retinaface
        .detect_faces(
            array.view(),
            &FaceDetectionConfig::default()
                .with_threshold(threshold)
                .with_nms_threshold(nms_threshold),
        )
        .change_context(errors::Error)
        .attach_printable("Failed to detect faces")?;
    // Pad each detection by 30% so the crop keeps some context around the face.
    let bboxes = output
        .bbox
        .iter()
        .inspect(|bbox| tracing::info!("Raw bbox: {:?}", bbox))
        .map(|bbox| bbox.as_::<f32>().scale_uniform(1.30).as_::<usize>())
        .inspect(|bbox| tracing::info!("Padded bbox: {:?}", bbox))
        .collect_vec();
    // Annotate the working image with the padded boxes.
    for bbox in &bboxes {
        tracing::info!("Detected face: {:?}", bbox);
        use bounding_box::draw::*;
        array.draw(bbox, color::palette::css::GREEN_YELLOW.to_rgba8(), 1);
    }
    // Extract every face ROI and resize to the embedder's expected 224x224 input.
    let face_rois = array
        .view()
        .multi_roi(&bboxes)
        .change_context(Error)?
        .into_iter()
        .map(|roi| {
            roi.as_standard_layout()
                .fast_resize(224, 224, &ResizeOptions::default())
                .change_context(Error)
        })
        .collect::<Result<Vec<_>>>()?;
    let face_roi_views = face_rois.iter().map(|roi| roi.view()).collect::<Vec<_>>();
    let embeddings: Vec<Array1<f32>> = face_roi_views
        .chunks(chunk_size)
        .map(|chunk| {
            tracing::info!("Processing chunk of size: {}", chunk.len());
            let og_size = chunk.len();
            if chunk.len() < chunk_size {
                // The embedder expects a fixed batch size; pad the trailing
                // partial batch with zeroed frames (dropped again below).
                tracing::warn!("Chunk size is less than {}, padding with zeros", chunk_size);
                let zeros = Array3::zeros((224, 224, 3));
                let chunk: Vec<_> = chunk
                    .iter()
                    .map(|arr| arr.reborrow())
                    .chain(core::iter::repeat(zeros.view()))
                    .take(chunk_size)
                    .collect();
                let face_rois: Array4<u8> = ndarray::stack(ndarray::Axis(0), chunk.as_slice())
                    .change_context(errors::Error)
                    .attach_printable("Failed to stack rois together")?;
                let output = facenet.run_models(face_rois.view()).change_context(Error)?;
                Ok((output, og_size))
            } else {
                let face_rois: Array4<u8> = ndarray::stack(ndarray::Axis(0), chunk)
                    .change_context(errors::Error)
                    .attach_printable("Failed to stack rois together")?;
                let output = facenet.run_models(face_rois.view()).change_context(Error)?;
                Ok((output, og_size))
            }
        })
        .collect::<Result<Vec<(Array2<f32>, usize)>>>()?
        .into_iter()
        // Keep only the first `og_size` rows of each batch: rows past that
        // are the zero-padding added above.
        .flat_map(|(chunk, size): (Array2<f32>, usize)| {
            chunk
                .rows()
                .into_iter()
                .take(size)
                .map(|row| row.to_owned())
                .collect_vec()
        })
        .collect::<Vec<Array1<f32>>>();
    Ok(DetectionOutput {
        bbox: bboxes,
        rois: face_rois,
        embeddings,
    })
}

View File

@@ -13,7 +13,7 @@ use ort::execution_providers::TensorRTExecutionProvider;
use ort::execution_providers::{CPUExecutionProvider, ExecutionProviderDispatch};
/// Supported execution providers for ONNX Runtime
#[derive(Debug, Clone)]
#[derive(Debug, Copy, Clone)]
pub enum ExecutionProvider {
/// CPU execution provider (always available)
CPU,