Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 28, 2025

Commit

dae1677

1 Parent(s): 521249b

Update models/loaders/matanyone_loader.py

Browse files

Files changed (1) hide show

models/loaders/matanyone_loader.py +194 -112

models/loaders/matanyone_loader.py CHANGED Viewed

@@ -1,10 +1,11 @@
 #!/usr/bin/env python3
 """
-MatAnyone Loader + Stateful Adapter
-- Loads the official model from Hugging Face.
-- Drives InferenceCore as intended: first-frame encode + warm-up, then propagation.
-- Normalizes inputs so conv2d never sees 5-D tensors.
-- Always outputs a 2-D, contiguous float32 mask [H,W] for OpenCV.
 """
 import os
@@ -20,23 +21,20 @@
 logger = logging.getLogger(__name__)
 # ------------------------- Shape & dtype utilities ------------------------- #
 def _select_device(pref: str) -> str:
-    pref = (pref or "").lower() if pref else ""
     if pref.startswith("cuda"):
         return "cuda" if torch.cuda.is_available() else "cpu"
     if pref == "cpu":
         return "cpu"
     return "cuda" if torch.cuda.is_available() else "cpu"
 def _as_tensor_on_device(x, device: str) -> torch.Tensor:
     if isinstance(x, torch.Tensor):
-        return x.to(device)
-    return torch.from_numpy(np.asarray(x)).to(device)
 def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     """
@@ -51,7 +49,7 @@ def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     elif x.dtype in (torch.int16, torch.int32, torch.int64):
         x = x.float()
-    # 5D [B,T,*,H,W] or [B,T,H,W,*] -> take first frame
     if x.ndim == 5:
         x = x[:, 0]  # -> 4D
@@ -83,20 +81,16 @@ def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     else:
         if x.shape[1] == 1:
             x = x.repeat(1, 3, 1, 1)
-        x = x.clamp_(0.0, 1.0).to(torch.float32)
     return x
 def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
-    """Prefer CHW for InferenceCore.step."""
     if img_bchw.ndim == 4 and img_bchw.shape[0] == 1:
         return img_bchw[0]
-    return img_bchw  # some builds may accept batched; we try CHW first
-def _to_1hw_mask(msk_b1hw: torch.Tensor) -> torch.Tensor:
-    """Non-idx path expects [1,H,W] for single target."""
     if msk_b1hw is None:
         return None
     if msk_b1hw.ndim == 4 and msk_b1hw.shape[1] == 1:
@@ -105,19 +99,15 @@ def _to_1hw_mask(msk_b1hw: torch.Tensor) -> torch.Tensor:
         return msk_b1hw
     raise ValueError(f"Expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
-def _resize_mask_to(img_bchw: torch.Tensor, mask_b1hw: torch.Tensor) -> torch.Tensor:
-    if mask_b1hw is None:
         return None
-    if img_bchw.shape[-2:] == mask_b1hw.shape[-2:]:
-        return mask_b1hw
-    return F.interpolate(mask_b1hw, size=img_bchw.shape[-2:], mode="nearest")
 def _to_2d_alpha_numpy(x) -> np.ndarray:
-    """
-    Convert probabilities/mattes to 2-D float32 [H,W] contiguous.
-    """
     t = torch.as_tensor(x).float()
     while t.ndim > 2:
         if t.ndim == 3:
@@ -128,7 +118,6 @@ def _to_2d_alpha_numpy(x) -> np.ndarray:
     out = t.detach().cpu().numpy().astype(np.float32)
     return np.ascontiguousarray(out)
 def debug_shapes(tag: str, image, mask) -> None:
     def _info(name, v):
         try:
@@ -141,35 +130,42 @@ def _info(name, v):
     _info("image", image)
     _info("mask", mask)
 # ------------------------------ Stateful Adapter --------------------------- #
 class _MatAnyoneSession:
     """
-    Minimal stateful controller around InferenceCore.
     Usage:
-        # frame 0 (has initial coarse mask):
-        alpha0 = session(frame0_rgb, mask0)      # encode + warm-up predict
         # frames 1..N (no mask):
-        alpha  = session(frame_rgb)              # propagate/refine
     """
-    def __init__(self, core, device: str):
         self.core = core
         self.device = device
         self.started = False
-        # discover supported step() kwargs
         try:
-            self._step_sig = inspect.signature(self.core.step)
-            self._has_first_frame_pred = "first_frame_pred" in self._step_sig.parameters
-            self._has_idx_mask = "idx_mask" in self._step_sig.parameters
         except Exception:
-            self._step_sig = None
             self._has_first_frame_pred = True
-            self._has_idx_mask = True
-        # discover output conversion helper
         self._has_prob_to_mask = hasattr(self.core, "output_prob_to_mask")
     def reset(self):
@@ -180,63 +176,23 @@ def reset(self):
             pass
         self.started = False
-    def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
-        """
-        Returns a 2-D float32 alpha [H,W] suitable for OpenCV.
-        Expects RGB image in HWC or similar; mask as [H,W] or broadcastable.
-        """
-        # Normalize inputs
-        img_bchw = _to_bchw(image, self.device, is_mask=False)   # [B,C,H,W]
-        msk_b1hw = _to_bchw(mask,  self.device, is_mask=True) if mask is not None else None
-        if msk_b1hw is not None:
-            msk_b1hw = _resize_mask_to(img_bchw, msk_b1hw)
-        img_chw = _to_chw_image(img_bchw)
-        m_1hw = _to_1hw_mask(msk_b1hw) if msk_b1hw is not None else None
-        try:
-            if not self.started:
-                if m_1hw is None:
-                    logger.warning("First frame arrived without a mask; returning neutral alpha.")
-                    return np.full(img_chw.shape[-2:], 0.5, dtype=np.float32)
-                # 1) Encode target on first frame
-                kwargs1 = {}
-                if self._has_idx_mask:
-                    kwargs1["idx_mask"] = False
-                _ = self.core.step(image=img_chw, mask=m_1hw, **kwargs1)
-                # 2) First-frame warm-up prediction + memorize
-                kwargs2 = {}
-                if self._has_first_frame_pred:
-                    kwargs2["first_frame_pred"] = True
-                out_prob = self.core.step(image=img_chw, **kwargs2)
-                alpha = self._to_alpha(out_prob)
-                self.started = True
-                return _to_2d_alpha_numpy(alpha)
-            # Subsequent frames: propagate without mask
-            out_prob = self.core.step(image=img_chw)
-            alpha = self._to_alpha(out_prob)
-            return _to_2d_alpha_numpy(alpha)
-        except Exception as e:
-            logger.debug(traceback.format_exc())
-            logger.warning(f"MatAnyone call failed; returning input mask as fallback: {e}")
-            if m_1hw is not None:
-                return _to_2d_alpha_numpy(m_1hw)
-            return np.full(img_chw.shape[-2:], 0.5, dtype=np.float32)
     def _to_alpha(self, out_prob):
-        """
-        Convert core output to alpha. Prefer core.output_prob_to_mask(matting=True) if available.
-        """
         if self._has_prob_to_mask:
             try:
                 return self.core.output_prob_to_mask(out_prob, matting=True)
             except Exception:
                 pass
-        # Fallback heuristics
         t = torch.as_tensor(out_prob).float()
         if t.ndim == 3 and t.shape[0] >= 1:
             return t[0]
@@ -244,12 +200,123 @@ def _to_alpha(self, out_prob):
             return t
         return torch.full((1, 1), 0.5, dtype=torch.float32, device=t.device if t.is_cuda else "cpu")
 # -------------------------------- Loader ---------------------------------- #
 class MatAnyoneLoader:
     """
-    Official MatAnyone loader with stateful adapter.
     """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
@@ -267,16 +334,14 @@ def _import_model_and_core(self):
         """
         Import MatAnyone + InferenceCore with resilient fallbacks (different dist layouts).
         """
-        # Try several possible import paths to be robust
         model_cls = core_cls = None
         err_msgs = []
         # Candidates for model class
-        model_paths = [
             ("matanyone.model.matanyone", "MatAnyone"),
             ("matanyone", "MatAnyone"),
-        ]
-        for mod, cls in model_paths:
             try:
                 m = __import__(mod, fromlist=[cls])
                 model_cls = getattr(m, cls)
@@ -285,11 +350,10 @@ def _import_model_and_core(self):
                 err_msgs.append(f"model {mod}.{cls}: {e}")
         # Candidates for InferenceCore
-        core_paths = [
             ("matanyone.inference.inference_core", "InferenceCore"),
             ("matanyone", "InferenceCore"),
-        ]
-        for mod, cls in core_paths:
             try:
                 m = __import__(mod, fromlist=[cls])
                 core_cls = getattr(m, cls)
@@ -312,9 +376,21 @@ def load(self) -> Optional[Any]:
         try:
             model_cls, core_cls = self._import_model_and_core()
             # Official pattern: model -> eval -> core(model, cfg=model.cfg)
             self.model = model_cls.from_pretrained(self.model_id)
-            self.model = self.model.to(self.device).eval()
             # Some builds require cfg; fall back if not present
             try:
@@ -324,17 +400,28 @@ def load(self) -> Optional[Any]:
                 else:
                     self.core = core_cls(self.model)
             except TypeError:
-                # signature without cfg
                 self.core = core_cls(self.model)
-            # Move core to device if it supports .to
             try:
                 if hasattr(self.core, "to"):
                     self.core.to(self.device)
             except Exception:
                 pass
-            self.adapter = _MatAnyoneSession(self.core, self.device)
             self.load_time = time.time() - start
             logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
             return self.adapter
@@ -345,11 +432,6 @@ def load(self) -> Optional[Any]:
             return None
     def cleanup(self):
-        if self.adapter:
-            try:
-                self.adapter.reset()
-            except Exception:
-                pass
         self.adapter = None
         self.core = None
         if self.model:

 #!/usr/bin/env python3
 """
+MatAnyone Loader + Stateful Adapter (OOM-resilient)
+- Canonical HF load (MatAnyone.from_pretrained -> InferenceCore(model, cfg))
+- Mixed precision (bf16/fp16) with safe fallback to fp32
+- Autocast + inference_mode around every call
+- Auto downscale with progressive retry on OOM, then upsample alpha back
+- Returns 2-D float32 [H,W] alpha for OpenCV
 """
 import os
 logger = logging.getLogger(__name__)
 # ------------------------- Shape & dtype utilities ------------------------- #
 def _select_device(pref: str) -> str:
+    pref = (pref or "").lower()
     if pref.startswith("cuda"):
         return "cuda" if torch.cuda.is_available() else "cpu"
     if pref == "cpu":
         return "cpu"
     return "cuda" if torch.cuda.is_available() else "cpu"
 def _as_tensor_on_device(x, device: str) -> torch.Tensor:
     if isinstance(x, torch.Tensor):
+        return x.to(device, non_blocking=True)
+    return torch.from_numpy(np.asarray(x)).to(device, non_blocking=True)
 def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     """
     elif x.dtype in (torch.int16, torch.int32, torch.int64):
         x = x.float()
+    # 5D -> take first time slice
     if x.ndim == 5:
         x = x[:, 0]  # -> 4D
     else:
         if x.shape[1] == 1:
             x = x.repeat(1, 3, 1, 1)
+        x = x.clamp_(0.0, 1.0)
     return x
 def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
     if img_bchw.ndim == 4 and img_bchw.shape[0] == 1:
         return img_bchw[0]
+    return img_bchw
+def _to_1hw_mask(msk_b1hw: torch.Tensor) -> Optional[torch.Tensor]:
     if msk_b1hw is None:
         return None
     if msk_b1hw.ndim == 4 and msk_b1hw.shape[1] == 1:
         return msk_b1hw
     raise ValueError(f"Expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
+def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask=False) -> Optional[torch.Tensor]:
+    if x is None:
         return None
+    if x.shape[-2:] == size_hw:
+        return x
+    mode = "nearest" if is_mask else "bilinear"
+    return F.interpolate(x, size=size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
 def _to_2d_alpha_numpy(x) -> np.ndarray:
     t = torch.as_tensor(x).float()
     while t.ndim > 2:
         if t.ndim == 3:
     out = t.detach().cpu().numpy().astype(np.float32)
     return np.ascontiguousarray(out)
 def debug_shapes(tag: str, image, mask) -> None:
     def _info(name, v):
         try:
     _info("image", image)
     _info("mask", mask)
 # ------------------------------ Stateful Adapter --------------------------- #
 class _MatAnyoneSession:
     """
+    Stateful controller around InferenceCore with OOM-resilient inference.
     Usage:
+        # frame 0 (has mask):
+        alpha0 = session(frame0_rgb01, mask01)
         # frames 1..N (no mask):
+        alpha  = session(frame_rgb01)
     """
+    def __init__(
+        self,
+        core,
+        device: str,
+        model_dtype: torch.dtype,
+        use_autocast: bool,
+        autocast_dtype: Optional[torch.dtype],
+        max_edge: int = 768,
+        target_pixels: int = 600_000,   # ~775x775 cap by area
+    ):
         self.core = core
         self.device = device
+        self.model_dtype = model_dtype
+        self.use_autocast = use_autocast and (device == "cuda")
+        self.autocast_dtype = autocast_dtype if self.use_autocast else None
+        self.max_edge = int(max_edge)
+        self.target_pixels = int(target_pixels)
         self.started = False
+        # feature detection
         try:
+            sig = inspect.signature(self.core.step)
+            self._has_first_frame_pred = "first_frame_pred" in sig.parameters
         except Exception:
             self._has_first_frame_pred = True
         self._has_prob_to_mask = hasattr(self.core, "output_prob_to_mask")
     def reset(self):
             pass
         self.started = False
+    # ---- helpers ----
+    def _compute_scaled_size(self, h: int, w: int) -> Tuple[int, int, float]:
+        if h <= 0 or w <= 0:
+            return h, w, 1.0
+        s1 = min(1.0, self.max_edge / max(h, w))
+        s2 = min(1.0, (self.target_pixels / (h * w)) ** 0.5) if self.target_pixels > 0 else 1.0
+        s = min(s1, s2)
+        nh = max(1, int(round(h * s)))
+        nw = max(1, int(round(w * s)))
+        return nh, nw, s
     def _to_alpha(self, out_prob):
         if self._has_prob_to_mask:
             try:
                 return self.core.output_prob_to_mask(out_prob, matting=True)
             except Exception:
                 pass
         t = torch.as_tensor(out_prob).float()
         if t.ndim == 3 and t.shape[0] >= 1:
             return t[0]
             return t
         return torch.full((1, 1), 0.5, dtype=torch.float32, device=t.device if t.is_cuda else "cpu")
+    # ---- main call ----
+    def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
+        """
+        Returns a 2-D float32 alpha [H,W]. On first call, provide a coarse mask.
+        Subsequent calls propagate without a mask.
+        """
+        # Boundary normalization
+        img_bchw = _to_bchw(image, self.device, is_mask=False)   # [1,C,H,W]
+        msk_b1hw = _to_bchw(mask,  self.device, is_mask=True) if mask is not None else None
+        H, W = img_bchw.shape[-2], img_bchw.shape[-1]
+        if msk_b1hw is not None:
+            msk_b1hw = _resize_bchw(msk_b1hw, (H, W), is_mask=True)
+        # dtype alignment for activations
+        img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
+        # initial scale + fallbacks
+        nh, nw, s = self._compute_scaled_size(H, W)
+        scales = [(nh, nw)]
+        if s < 1.0:
+            scales.append((max(1, int(nh * 0.85)), max(1, int(nw * 0.85))))
+            scales.append((max(1, int(nh * 0.70)), max(1, int(nw * 0.70))))
+        last_exc = None
+        for (th, tw) in scales:
+            try:
+                # downscale for inference if needed
+                img_in = _resize_bchw(img_bchw, (th, tw), is_mask=False)
+                msk_in = _resize_bchw(msk_b1hw, (th, tw), is_mask=True) if msk_b1hw is not None else None
+                img_chw = _to_chw_image(img_in)
+                m_1hw  = _to_1hw_mask(msk_in) if msk_in is not None else None
+                # inference with autocast + inference_mode
+                with torch.inference_mode():
+                    if self.use_autocast:
+                        amp_ctx = torch.cuda.amp.autocast(dtype=self.autocast_dtype)
+                    else:
+                        class _NoOp:
+                            def __enter__(self): return None
+                            def __exit__(self, *args): return False
+                        amp_ctx = _NoOp()
+                    with amp_ctx:
+                        if not self.started:
+                            if m_1hw is None:
+                                logger.warning("First frame arrived without a mask; returning neutral alpha.")
+                                return np.full((H, W), 0.5, dtype=np.float32)
+                            # encode/memorize
+                            _ = self.core.step(image=img_chw, mask=m_1hw)
+                            # warm-up predict
+                            if self._has_first_frame_pred:
+                                out_prob = self.core.step(image=img_chw, first_frame_pred=True)
+                            else:
+                                out_prob = self.core.step(image=img_chw)
+                            alpha = self._to_alpha(out_prob)
+                            self.started = True
+                        else:
+                            out_prob = self.core.step(image=img_chw)
+                            alpha = self._to_alpha(out_prob)
+                # upsample back to original resolution if scaled
+                if (th, tw) != (H, W):
+                    alpha = torch.as_tensor(alpha).unsqueeze(0).unsqueeze(0).float()
+                    alpha = F.interpolate(alpha, size=(H, W), mode="bilinear", align_corners=False)
+                    alpha = alpha.squeeze(0).squeeze(0)
+                return _to_2d_alpha_numpy(alpha)
+            except torch.cuda.OutOfMemoryError as e:
+                last_exc = e
+                logger.warning(f"MatAnyone OOM at {th}x{tw}; retrying smaller. {e}")
+                torch.cuda.empty_cache()
+                continue
+            except Exception as e:
+                last_exc = e
+                logger.debug(traceback.format_exc())
+                logger.warning(f"MatAnyone call failed at {th}x{tw}; retrying smaller. {e}")
+                torch.cuda.empty_cache()
+                continue
+        # All attempts failed → return fallback
+        logger.warning(f"MatAnyone calls failed; returning input mask as fallback. {last_exc}")
+        if msk_b1hw is not None:
+            return _to_2d_alpha_numpy(msk_b1hw)
+        return np.full((H, W), 0.5, dtype=np.float32)
 # -------------------------------- Loader ---------------------------------- #
+def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
+    """
+    Decide model+autocast dtypes.
+    Strategy:
+      - Prefer bf16 autocast if supported (Ampere+), keep weights bf16 if possible.
+      - Else use fp16 autocast, keep weights fp16 if safe.
+      - Else fp32 without autocast.
+    """
+    if device != "cuda":
+        return torch.float32, False, None
+    bf16_ok = hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()
+    cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
+    fp16_ok = cc[0] >= 7  # Volta+
+    if bf16_ok:
+        return torch.bfloat16, True, torch.bfloat16
+    if fp16_ok:
+        return torch.float16, True, torch.float16
+    return torch.float32, False, None
 class MatAnyoneLoader:
     """
+    Official MatAnyone loader with stateful, OOM-resilient adapter.
     """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
         """
         Import MatAnyone + InferenceCore with resilient fallbacks (different dist layouts).
         """
         model_cls = core_cls = None
         err_msgs = []
         # Candidates for model class
+        for mod, cls in [
             ("matanyone.model.matanyone", "MatAnyone"),
             ("matanyone", "MatAnyone"),
+        ]:
             try:
                 m = __import__(mod, fromlist=[cls])
                 model_cls = getattr(m, cls)
                 err_msgs.append(f"model {mod}.{cls}: {e}")
         # Candidates for InferenceCore
+        for mod, cls in [
             ("matanyone.inference.inference_core", "InferenceCore"),
             ("matanyone", "InferenceCore"),
+        ]:
             try:
                 m = __import__(mod, fromlist=[cls])
                 core_cls = getattr(m, cls)
         try:
             model_cls, core_cls = self._import_model_and_core()
+            # pick precision strategy
+            model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
+            logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
             # Official pattern: model -> eval -> core(model, cfg=model.cfg)
             self.model = model_cls.from_pretrained(self.model_id)
+            # Try to move weights to selected dtype (safe try)
+            try:
+                self.model = self.model.to(self.device).to(model_dtype)
+            except Exception:
+                self.model = self.model.to(self.device)
+                # keep weights fp32; still benefit from autocast
+            self.model.eval()
             # Some builds require cfg; fall back if not present
             try:
                 else:
                     self.core = core_cls(self.model)
             except TypeError:
                 self.core = core_cls(self.model)
             try:
                 if hasattr(self.core, "to"):
                     self.core.to(self.device)
             except Exception:
                 pass
+            # tune scaling from env (optional)
+            max_edge = int(os.environ.get("MATANYONE_MAX_EDGE", "768"))
+            target_pixels = int(os.environ.get("MATANYONE_TARGET_PIXELS", "600000"))
+            self.adapter = _MatAnyoneSession(
+                self.core,
+                device=self.device,
+                model_dtype=model_dtype,
+                use_autocast=use_autocast,
+                autocast_dtype=autocast_dtype,
+                max_edge=max_edge,
+                target_pixels=target_pixels,
+            )
             self.load_time = time.time() - start
             logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
             return self.adapter
             return None
     def cleanup(self):
         self.adapter = None
         self.core = None
         if self.model: