Spaces: Running on Zero
Clean up
Browse files
app.py CHANGED
@@ -1,6 +1,5 @@
 import colorsys
 import gc
-from typing import Optional
 
 import cv2
 import gradio as gr
@@ -10,24 +9,15 @@ from gradio.themes import Soft
 from PIL import Image, ImageDraw, ImageFont
 from transformers import Sam3TrackerVideoModel, Sam3TrackerVideoProcessor, Sam3VideoModel, Sam3VideoProcessor
 
+MODEL_ID = "facebook/sam3"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.bfloat16
 
-
-
-    dtype = torch.bfloat16
-    return device, dtype
+TRACKER_MODEL = Sam3TrackerVideoModel.from_pretrained(MODEL_ID, torch_dtype=DTYPE, device_map=DEVICE).eval()
+TRACKER_PROCESSOR = Sam3TrackerVideoProcessor.from_pretrained(MODEL_ID)
 
-
-
-_GLOBAL_MODEL_REPO_ID = "facebook/sam3"
-
-_GLOBAL_TRACKER_MODEL = Sam3TrackerVideoModel.from_pretrained(
-    _GLOBAL_MODEL_REPO_ID, torch_dtype=_GLOBAL_DTYPE, device_map=_GLOBAL_DEVICE
-).eval()
-_GLOBAL_TRACKER_PROCESSOR = Sam3TrackerVideoProcessor.from_pretrained(_GLOBAL_MODEL_REPO_ID)
-
-_GLOBAL_TEXT_VIDEO_MODEL = Sam3VideoModel.from_pretrained(_GLOBAL_MODEL_REPO_ID)
-_GLOBAL_TEXT_VIDEO_MODEL = _GLOBAL_TEXT_VIDEO_MODEL.to(_GLOBAL_DEVICE, dtype=_GLOBAL_DTYPE).eval()
-_GLOBAL_TEXT_VIDEO_PROCESSOR = Sam3VideoProcessor.from_pretrained(_GLOBAL_MODEL_REPO_ID)
+TEXT_VIDEO_MODEL = Sam3VideoModel.from_pretrained(MODEL_ID).to(DEVICE, dtype=DTYPE).eval()
+TEXT_VIDEO_PROCESSOR = Sam3VideoProcessor.from_pretrained(MODEL_ID)
 print("Models loaded successfully!")
 
 
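For readers following along outside the diff, the new top-of-file setup is equivalent to the standalone snippet below. This is a sketch assuming a transformers build that ships the SAM3 classes; on machines without CUDA it falls back to CPU, where bfloat16 may be slow.

```python
import torch
from transformers import (
    Sam3TrackerVideoModel,
    Sam3TrackerVideoProcessor,
    Sam3VideoModel,
    Sam3VideoProcessor,
)

MODEL_ID = "facebook/sam3"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16

# Point-prompt tracker: weights are placed directly on DEVICE via device_map.
TRACKER_MODEL = Sam3TrackerVideoModel.from_pretrained(MODEL_ID, torch_dtype=DTYPE, device_map=DEVICE).eval()
TRACKER_PROCESSOR = Sam3TrackerVideoProcessor.from_pretrained(MODEL_ID)

# Text-prompt video model: loaded first, then moved and cast in one step.
TEXT_VIDEO_MODEL = Sam3VideoModel.from_pretrained(MODEL_ID).to(DEVICE, dtype=DTYPE).eval()
TEXT_VIDEO_PROCESSOR = Sam3VideoProcessor.from_pretrained(MODEL_ID)

print("Models loaded successfully!")
```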
@@ -149,9 +139,6 @@ def init_video_session(
     GLOBAL_STATE.inference_session = None
     GLOBAL_STATE.active_tab = active_tab
 
-    device = _GLOBAL_DEVICE
-    dtype = _GLOBAL_DTYPE
-
     video_path: str | None = None
     if isinstance(video, dict):
         video_path = video.get("name") or video.get("path") or video.get("data")
@@ -182,23 +169,23 @@ def init_video_session(
     raw_video = [np.array(frame) for frame in frames]
 
     if active_tab == "text":
-        processor = _GLOBAL_TEXT_VIDEO_PROCESSOR
+        processor = TEXT_VIDEO_PROCESSOR
         GLOBAL_STATE.inference_session = processor.init_video_session(
             video=frames,
-            inference_device=device,
+            inference_device=DEVICE,
             processing_device="cpu",
             video_storage_device="cpu",
-            dtype=dtype,
+            dtype=DTYPE,
         )
     else:
-        processor = _GLOBAL_TRACKER_PROCESSOR
+        processor = TRACKER_PROCESSOR
         GLOBAL_STATE.inference_session = processor.init_video_session(
             video=raw_video,
-            inference_device=device,
+            inference_device=DEVICE,
             video_storage_device="cpu",
             processing_device="cpu",
-            inference_state_device=device,
-            dtype=dtype,
+            inference_state_device=DEVICE,
+            dtype=DTYPE,
         )
 
     first_frame = frames[0]
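The two call sites above only differ in the frame format and one extra keyword; collapsed into a helper they look roughly like the sketch below. The keyword set is taken from the app code in this diff, not from separate API docs, and `frames` is assumed to be the list of PIL frames the app already extracts.

```python
import numpy as np

def make_session(frames, use_text_prompts: bool):
    if use_text_prompts:
        # Text-prompted segmentation: PIL frames go straight to the processor.
        return TEXT_VIDEO_PROCESSOR.init_video_session(
            video=frames,
            inference_device=DEVICE,
            processing_device="cpu",
            video_storage_device="cpu",
            dtype=DTYPE,
        )
    # Point-prompted tracking: frames become numpy arrays and the inference
    # state is kept on the same device as the model.
    raw_video = [np.array(frame) for frame in frames]
    return TRACKER_PROCESSOR.init_video_session(
        video=raw_video,
        inference_device=DEVICE,
        video_storage_device="cpu",
        processing_device="cpu",
        inference_state_device=DEVICE,
        dtype=DTYPE,
    )
```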
@@ -206,12 +193,12 @@ def init_video_session(
     if active_tab == "text":
         status = (
             f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps{trimmed_note}. "
-            f"Device: {device}, dtype: bfloat16. Ready for text prompting."
+            f"Device: {DEVICE}, dtype: bfloat16. Ready for text prompting."
         )
     else:
         status = (
             f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps{trimmed_note}. "
-            f"Device: {device}, dtype: bfloat16. Video session initialized."
+            f"Device: {DEVICE}, dtype: bfloat16. Video session initialized."
         )
     return GLOBAL_STATE, 0, max_idx, first_frame, status
 
@@ -384,8 +371,8 @@ def on_image_click(
     if state is None or state.inference_session is None:
         return img
 
-    model = _GLOBAL_TRACKER_MODEL
-    processor = _GLOBAL_TRACKER_PROCESSOR
+    model = TRACKER_MODEL
+    processor = TRACKER_PROCESSOR
 
     x = y = None
     if evt is not None:
@@ -492,8 +479,8 @@ def on_text_prompt(
     if state is None or state.inference_session is None:
         return None, "Upload a video and enter text prompt.", "**Active prompts:** None"
 
-    model = _GLOBAL_TEXT_VIDEO_MODEL
-    processor = _GLOBAL_TEXT_VIDEO_PROCESSOR
+    model = TEXT_VIDEO_MODEL
+    processor = TEXT_VIDEO_PROCESSOR
 
     if not text_prompt or not text_prompt.strip():
         active_prompts = _get_active_prompts_display(state)
@@ -626,8 +613,8 @@ def propagate_masks(GLOBAL_STATE: gr.State):
         yield GLOBAL_STATE, "Text video model not loaded.", gr.update()
         return
 
-    model = _GLOBAL_TEXT_VIDEO_MODEL
-    processor = _GLOBAL_TEXT_VIDEO_PROCESSOR
+    model = TEXT_VIDEO_MODEL
+    processor = TEXT_VIDEO_PROCESSOR
 
     # Collect all unique prompts from existing frame annotations
     text_prompt_to_obj_ids = {}
@@ -723,8 +710,8 @@ def propagate_masks(GLOBAL_STATE: gr.State):
         yield GLOBAL_STATE, "Tracker model not loaded.", gr.update()
         return
 
-    model = _GLOBAL_TRACKER_MODEL
-    processor = _GLOBAL_TRACKER_PROCESSOR
+    model = TRACKER_MODEL
+    processor = TRACKER_PROCESSOR
 
     for sam2_video_output in model.propagate_in_video_iterator(
         inference_session=GLOBAL_STATE.inference_session
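Both propagation paths follow the same pattern: pick the matching model/processor pair, then stream per-frame outputs from the iterator. A minimal sketch of that loop is below; the fields on each per-frame output are not visible in this diff, so the sketch only counts frames.

```python
import torch

def count_propagated_frames(session) -> int:
    # Assumes TRACKER_MODEL and a session created by TRACKER_PROCESSOR.init_video_session(...).
    frame_count = 0
    with torch.inference_mode():
        for _frame_output in TRACKER_MODEL.propagate_in_video_iterator(inference_session=session):
            frame_count += 1
    return frame_count
```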
@@ -826,27 +813,27 @@ def reset_session(GLOBAL_STATE: gr.State) -> tuple[AppState, Image.Image, int, i
 
     if GLOBAL_STATE.active_tab == "text":
         if GLOBAL_STATE.video_frames:
-            processor = _GLOBAL_TEXT_VIDEO_PROCESSOR
+            processor = TEXT_VIDEO_PROCESSOR
             GLOBAL_STATE.inference_session = processor.init_video_session(
                 video=GLOBAL_STATE.video_frames,
-                inference_device=_GLOBAL_DEVICE,
+                inference_device=DEVICE,
                 processing_device="cpu",
                 video_storage_device="cpu",
-                dtype=_GLOBAL_DTYPE,
+                dtype=DTYPE,
             )
     elif GLOBAL_STATE.inference_session is not None and hasattr(
         GLOBAL_STATE.inference_session, "reset_inference_session"
     ):
         GLOBAL_STATE.inference_session.reset_inference_session()
     elif GLOBAL_STATE.video_frames:
-        processor = _GLOBAL_TRACKER_PROCESSOR
+        processor = TRACKER_PROCESSOR
         raw_video = [np.array(frame) for frame in GLOBAL_STATE.video_frames]
         GLOBAL_STATE.inference_session = processor.init_video_session(
             video=raw_video,
-            inference_device=_GLOBAL_DEVICE,
+            inference_device=DEVICE,
             video_storage_device="cpu",
             processing_device="cpu",
-            dtype=_GLOBAL_DTYPE,
+            dtype=DTYPE,
         )
 
     GLOBAL_STATE.masks_by_frame.clear()
@@ -894,9 +881,7 @@ def _on_video_change_text(GLOBAL_STATE: gr.State, video):
     )
 
 
-
-
-with gr.Blocks(title="SAM3", theme=theme) as demo:
+with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
     GLOBAL_STATE = gr.State(AppState())
 
     gr.Markdown(
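The last hunk inlines the Soft theme instead of going through a separate `theme` variable. Stripped of the app-specific layout, the pattern is just the sketch below; the hue names are standard gradio hue presets and the Markdown placeholder is illustrative only.

```python
import gradio as gr
from gradio.themes import Soft

with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
    gr.Markdown("SAM3 video segmentation demo")

if __name__ == "__main__":
    demo.launch()
```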