Spaces:

MogensR
/

VideoBackgroundReplacer2

Paused

App Files Community

MogensR committed on Oct 4

Commit

27430ce

verified ·

1 Parent(s): d838606

Update pipeline/video_pipeline.py

Browse files

Files changed (1) hide show

pipeline/video_pipeline.py +70 -11

pipeline/video_pipeline.py CHANGED Viewed

@@ -21,7 +21,7 @@
 from collections import deque
 import torch
 from PIL import Image
-import contextlib  # <-- added
 import streamlit as st
@@ -107,7 +107,7 @@ def _normalize_input(inp, work_dir: Path) -> str:
     return str(target)
 # --- SAM2 Mask Generation (multi-frame, CUDA-for-seed only; returns mask at ORIGINAL size) ---
-def generate_first_frame_mask(video_path, predictor, num_frames: int = 3):
     """
     Build a robust seed mask by running SAM2 on the first N frames (default 3),
     upsampling each mask back to the ORIGINAL video resolution, and combining
@@ -115,6 +115,9 @@ def generate_first_frame_mask(video_path, predictor, num_frames: int = 3):
     offloaded back to CPU to free VRAM before MatAnyone runs.
     Output is a uint8 mask in {0, 255} at (orig_h, orig_w).
     """
     # Move SAM2 model to CUDA only for seeding
     try:
         if torch.cuda.is_available() and hasattr(predictor, "model"):
@@ -157,6 +160,9 @@ def generate_first_frame_mask(video_path, predictor, num_frames: int = 3):
     autocast_ctx = torch.autocast("cuda", dtype=torch.float16) if torch.cuda.is_available() else contextlib.nullcontext()
     with torch.inference_mode(), autocast_ctx:
         for idx, frame in enumerate(frames):
             h, w = frame.shape[:2]
             # Downscale for inference if needed (≤1080 on the long side)
             if max(h, w) > 1080:
@@ -212,6 +218,9 @@ def generate_first_frame_mask(video_path, predictor, num_frames: int = 3):
     logger.info(f"[sam2] multi-frame seed: N={len(masks_fullres)}, "
                 f"orig_size={orig_w}x{orig_h}, majority={required}/{len(masks_fullres)}")
     # Offload SAM2 weights + free CUDA cache BEFORE MatAnyone
     try:
         if hasattr(predictor, "model"):
@@ -227,8 +236,11 @@ def generate_first_frame_mask(video_path, predictor, num_frames: int = 3):
     return vote
 # --- Temporal Smoothing ---
-def smooth_alpha_video(alpha_path, output_path, window_size=5):
     """Apply temporal smoothing to alpha masks"""
     cap = cv2.VideoCapture(alpha_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
@@ -250,8 +262,11 @@ def smooth_alpha_video(alpha_path, output_path, window_size=5):
     return output_path
 # --- Transparent MOV Creation (FFmpeg) ---
-def create_transparent_mov(foreground_path, alpha_path, output_dir):
     """Create transparent MOV using FFmpeg (reliable alpha handling)"""
     output_path = str(output_dir / "transparent.mov")
     logger.info(f"[create_transparent_mov] Foreground: {foreground_path}, Alpha: {alpha_path}, Output: {output_path}")
     try:
@@ -288,9 +303,13 @@ def create_transparent_mov(foreground_path, alpha_path, output_dir):
         return None
 # --- Stage 1: Transparent Video Creation (with watchdog for MatAnyone) ---
-def stage1_create_transparent_video(input_file, sam2_predictor, matanyone_processor, mat_timeout_sec: int = 180):
     """Pipeline: SAM2 → MatAnyone → FFmpeg MOV (with watchdog timeout on MatAnyone)"""
     logger.info("Stage 1: Creating transparent video")
     heartbeat_flag = {"running": True}
     threading.Thread(target=heartbeat_monitor, args=(heartbeat_flag,), daemon=True).start()
     try:
@@ -308,6 +327,9 @@ def stage1_create_transparent_video(input_file, sam2_predictor, matanyone_proces
                 raise FileNotFoundError(f"Input not found: {input_path}")
             # 1) Extract audio (best effort)
             audio_path = str(temp_dir / "audio.aac")
             if extract_audio(input_path, audio_path):
                 try:
@@ -320,7 +342,7 @@ def stage1_create_transparent_video(input_file, sam2_predictor, matanyone_proces
                 audio_path = None
             # 2) Seed mask via SAM2 (multi-frame at original size)
-            mask = generate_first_frame_mask(input_path, sam2_predictor)
             mask_path = str(temp_dir / "mask.png")
             ok = cv2.imwrite(mask_path, mask)
             if not ok or not os.path.exists(mask_path):
@@ -328,6 +350,9 @@ def stage1_create_transparent_video(input_file, sam2_predictor, matanyone_proces
             logger.info(f"[stage1] First-frame mask saved: {mask_path}")
             # 3) MatAnyone with watchdog timeout
             if torch.cuda.is_available():
                 try:
                     name = torch.cuda.get_device_name(0)
@@ -346,6 +371,7 @@ def stage1_create_transparent_video(input_file, sam2_predictor, matanyone_proces
             )
             result_holder = {"ok": False, "fg": None, "alpha": None, "exc": None}
             def _run_matanyone():
                 try:
@@ -363,7 +389,15 @@ def _run_matanyone():
             t = threading.Thread(target=_run_matanyone, daemon=True)
             t.start()
-            t.join(timeout=mat_timeout_sec)
             if t.is_alive():
                 logger.error(f"[stage1] MatAnyone timed out after {mat_timeout_sec}s")
@@ -375,6 +409,9 @@ def _run_matanyone():
             foreground_path, alpha_path = result_holder["fg"], result_holder["alpha"]
             logger.info(f"[stage1] MatAnyone output: foreground={foreground_path}, alpha={alpha_path}")
             if not foreground_path or not os.path.exists(foreground_path):
                 raise FileNotFoundError(f"MatAnyone foreground missing: {foreground_path}")
             if not alpha_path or not os.path.exists(alpha_path):
@@ -388,13 +425,13 @@ def _run_matanyone():
             logger.info(f"[stage1] Sizes: foreground={fg_sz} bytes, alpha={al_sz} bytes")
             # 4) Temporal smoothing (alpha)
-            smoothed_alpha = smooth_alpha_video(alpha_path, str(temp_dir / "alpha_smoothed.mp4"))
             if not os.path.exists(smoothed_alpha):
                 raise FileNotFoundError(f"Smoothed alpha missing: {smoothed_alpha}")
             logger.info(f"[stage1] Smoothed alpha: {smoothed_alpha}")
             # 5) Create transparent MOV
-            transparent_path = create_transparent_mov(foreground_path, smoothed_alpha, temp_dir)
             if not transparent_path or not os.path.exists(transparent_path):
                 raise RuntimeError("Transparent MOV creation failed")
@@ -404,6 +441,9 @@ def _run_matanyone():
             shutil.copyfile(transparent_path, persist_path)
             logger.info(f"[stage1] Transparent video saved: {persist_path}")
             # Return paths for Stage 2
             return str(persist_path), audio_path
@@ -418,15 +458,22 @@ def _run_matanyone():
         gc.collect()
 # --- Stage 2: Background Compositing + Audio Muxing ---
-def stage2_composite_background(transparent_video_path, audio_path, background, bg_type):
     """Composite transparent video with background and restore audio"""
     logger.info("Stage 2: Compositing with background and audio")
     try:
         cap = cv2.VideoCapture(transparent_video_path)
         fps = cap.get(cv2.CAP_PROP_FPS)
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         # Prepare background
         if bg_type.lower() == "image" and isinstance(background, Image.Image):
             bg_array = cv2.cvtColor(np.array(background.resize((width, height))), cv2.COLOR_RGB2BGR)
@@ -438,6 +485,9 @@ def stage2_composite_background(transparent_video_path, audio_path, background,
         bg_resized = cv2.resize(bg_array, (width, height))
         # Composite frames (no audio yet)
         temp_output_path = str(Path("tmp") / "final_video_no_audio.mp4")
         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
@@ -455,17 +505,26 @@ def stage2_composite_background(transparent_video_path, audio_path, background,
         cap.release()
         out.release()
         # Mux audio back into the final video
         final_output_path = str(Path("tmp") / "final_output.mp4")
         if audio_path and os.path.exists(audio_path):
             success = mux_audio(temp_output_path, audio_path, final_output_path)
             if not success:
                 logger.warning("Audio muxing failed, returning video without audio")
                 return temp_output_path
             os.remove(temp_output_path)  # Clean up temp file
             return final_output_path
         else:
             logger.warning("No audio found, returning video without audio")
             return temp_output_path
     except Exception as e:
         logger.error(f"Stage 2 failed: {e}", exc_info=True)
@@ -482,4 +541,4 @@ def check_gpu(logger):
     return False
 # --- Initialize T4 tuning immediately if imported as module ---
-setup_t4_environment()

 from collections import deque
 import torch
 from PIL import Image
+import contextlib
 import streamlit as st
     return str(target)
 # --- SAM2 Mask Generation (multi-frame, CUDA-for-seed only; returns mask at ORIGINAL size) ---
+def generate_first_frame_mask(video_path, predictor, num_frames: int = 3, progress_callback=None):
     """
     Build a robust seed mask by running SAM2 on the first N frames (default 3),
     upsampling each mask back to the ORIGINAL video resolution, and combining
     offloaded back to CPU to free VRAM before MatAnyone runs.
     Output is a uint8 mask in {0, 255} at (orig_h, orig_w).
     """
+    if progress_callback:
+        progress_callback("🎯 GPU engaged - SAM2 generating seed mask...")
     # Move SAM2 model to CUDA only for seeding
     try:
         if torch.cuda.is_available() and hasattr(predictor, "model"):
     autocast_ctx = torch.autocast("cuda", dtype=torch.float16) if torch.cuda.is_available() else contextlib.nullcontext()
     with torch.inference_mode(), autocast_ctx:
         for idx, frame in enumerate(frames):
+            if progress_callback:
+                progress_callback(f"🎯 SAM2 processing frame {idx+1}/{len(frames)}...")
             h, w = frame.shape[:2]
             # Downscale for inference if needed (≤1080 on the long side)
             if max(h, w) > 1080:
     logger.info(f"[sam2] multi-frame seed: N={len(masks_fullres)}, "
                 f"orig_size={orig_w}x{orig_h}, majority={required}/{len(masks_fullres)}")
+    if progress_callback:
+        progress_callback("🧹 SAM2 complete - clearing GPU memory...")
     # Offload SAM2 weights + free CUDA cache BEFORE MatAnyone
     try:
         if hasattr(predictor, "model"):
     return vote
 # --- Temporal Smoothing ---
+def smooth_alpha_video(alpha_path, output_path, window_size=5, progress_callback=None):
     """Apply temporal smoothing to alpha masks"""
+    if progress_callback:
+        progress_callback("🎬 Smoothing alpha channel...")
     cap = cv2.VideoCapture(alpha_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     return output_path
 # --- Transparent MOV Creation (FFmpeg) ---
+def create_transparent_mov(foreground_path, alpha_path, output_dir, progress_callback=None):
     """Create transparent MOV using FFmpeg (reliable alpha handling)"""
+    if progress_callback:
+        progress_callback("🎞️ Creating transparent video with alpha channel...")
     output_path = str(output_dir / "transparent.mov")
     logger.info(f"[create_transparent_mov] Foreground: {foreground_path}, Alpha: {alpha_path}, Output: {output_path}")
     try:
         return None
 # --- Stage 1: Transparent Video Creation (with watchdog for MatAnyone) ---
+def stage1_create_transparent_video(input_file, sam2_predictor, matanyone_processor, mat_timeout_sec: int = 180, progress_callback=None):
     """Pipeline: SAM2 → MatAnyone → FFmpeg MOV (with watchdog timeout on MatAnyone)"""
     logger.info("Stage 1: Creating transparent video")
+    if progress_callback:
+        progress_callback("✅ Stage 1 initiated")
     heartbeat_flag = {"running": True}
     threading.Thread(target=heartbeat_monitor, args=(heartbeat_flag,), daemon=True).start()
     try:
                 raise FileNotFoundError(f"Input not found: {input_path}")
             # 1) Extract audio (best effort)
+            if progress_callback:
+                progress_callback("🎵 Extracting audio from video...")
             audio_path = str(temp_dir / "audio.aac")
             if extract_audio(input_path, audio_path):
                 try:
                 audio_path = None
             # 2) Seed mask via SAM2 (multi-frame at original size)
+            mask = generate_first_frame_mask(input_path, sam2_predictor, progress_callback=progress_callback)
             mask_path = str(temp_dir / "mask.png")
             ok = cv2.imwrite(mask_path, mask)
             if not ok or not os.path.exists(mask_path):
             logger.info(f"[stage1] First-frame mask saved: {mask_path}")
             # 3) MatAnyone with watchdog timeout
+            if progress_callback:
+                progress_callback("🎬 MatAnyone starting video matting...")
             if torch.cuda.is_available():
                 try:
                     name = torch.cuda.get_device_name(0)
             )
             result_holder = {"ok": False, "fg": None, "alpha": None, "exc": None}
+            start_time = time.time()
             def _run_matanyone():
                 try:
             t = threading.Thread(target=_run_matanyone, daemon=True)
             t.start()
+            # Poll with progress updates
+            while t.is_alive():
+                elapsed = int(time.time() - start_time)
+                if progress_callback:
+                    progress_callback(f"🎬 MatAnyone processing... {elapsed}s elapsed")
+                t.join(timeout=5)  # Check every 5 seconds
+                if elapsed > mat_timeout_sec:
+                    break
             if t.is_alive():
                 logger.error(f"[stage1] MatAnyone timed out after {mat_timeout_sec}s")
             foreground_path, alpha_path = result_holder["fg"], result_holder["alpha"]
             logger.info(f"[stage1] MatAnyone output: foreground={foreground_path}, alpha={alpha_path}")
+            if progress_callback:
+                progress_callback("✅ MatAnyone complete")
             if not foreground_path or not os.path.exists(foreground_path):
                 raise FileNotFoundError(f"MatAnyone foreground missing: {foreground_path}")
             if not alpha_path or not os.path.exists(alpha_path):
             logger.info(f"[stage1] Sizes: foreground={fg_sz} bytes, alpha={al_sz} bytes")
             # 4) Temporal smoothing (alpha)
+            smoothed_alpha = smooth_alpha_video(alpha_path, str(temp_dir / "alpha_smoothed.mp4"), progress_callback=progress_callback)
             if not os.path.exists(smoothed_alpha):
                 raise FileNotFoundError(f"Smoothed alpha missing: {smoothed_alpha}")
             logger.info(f"[stage1] Smoothed alpha: {smoothed_alpha}")
             # 5) Create transparent MOV
+            transparent_path = create_transparent_mov(foreground_path, smoothed_alpha, temp_dir, progress_callback=progress_callback)
             if not transparent_path or not os.path.exists(transparent_path):
                 raise RuntimeError("Transparent MOV creation failed")
             shutil.copyfile(transparent_path, persist_path)
             logger.info(f"[stage1] Transparent video saved: {persist_path}")
+            if progress_callback:
+                progress_callback("✅ Stage 1 complete")
             # Return paths for Stage 2
             return str(persist_path), audio_path
         gc.collect()
 # --- Stage 2: Background Compositing + Audio Muxing ---
+def stage2_composite_background(transparent_video_path, audio_path, background, bg_type, progress_callback=None):
     """Composite transparent video with background and restore audio"""
     logger.info("Stage 2: Compositing with background and audio")
+    if progress_callback:
+        progress_callback("🚀 Stage 2 begun")
     try:
         cap = cv2.VideoCapture(transparent_video_path)
         fps = cap.get(cv2.CAP_PROP_FPS)
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        if progress_callback:
+            progress_callback("🎨 Preparing background...")
         # Prepare background
         if bg_type.lower() == "image" and isinstance(background, Image.Image):
             bg_array = cv2.cvtColor(np.array(background.resize((width, height))), cv2.COLOR_RGB2BGR)
         bg_resized = cv2.resize(bg_array, (width, height))
+        if progress_callback:
+            progress_callback("🎬 Compositing frames...")
         # Composite frames (no audio yet)
         temp_output_path = str(Path("tmp") / "final_video_no_audio.mp4")
         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
         cap.release()
         out.release()
+        if progress_callback:
+            progress_callback("🎵 Restoring audio...")
         # Mux audio back into the final video
         final_output_path = str(Path("tmp") / "final_output.mp4")
         if audio_path and os.path.exists(audio_path):
             success = mux_audio(temp_output_path, audio_path, final_output_path)
             if not success:
                 logger.warning("Audio muxing failed, returning video without audio")
+                if progress_callback:
+                    progress_callback("⚠️ Stage 2 complete (no audio)")
                 return temp_output_path
             os.remove(temp_output_path)  # Clean up temp file
+            if progress_callback:
+                progress_callback("✅ Stage 2 complete")
             return final_output_path
         else:
             logger.warning("No audio found, returning video without audio")
+            if progress_callback:
+                progress_callback("✅ Stage 2 complete (no audio)")
             return temp_output_path
     except Exception as e:
         logger.error(f"Stage 2 failed: {e}", exc_info=True)
     return False
 # --- Initialize T4 tuning immediately if imported as module ---
+setup_t4_environment()