VeuReu committed on
Commit de07c6b · verified · 1 Parent(s): af1ccf1

Update app.py

Files changed (1)
  app.py  +259  -61
app.py CHANGED
@@ -1,21 +1,37 @@
- # app.py — veureu/svision (Salamandra Vision 7B · ZeroGPU) — compatible con ENGINE
- import os
  import json
- from typing import Dict, List, Optional, Tuple, Union, Any

  import gradio as gr
  import spaces
  import torch
- from facenet_pytorch import MTCNN, InceptionResnetV1
- import numpy as np
  from PIL import Image
  from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

- import cv2
- from scenedetect import VideoManager, SceneManager
- from scenedetect.detectors import ContentDetector


  MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")
  DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -27,6 +43,21 @@ _facenet = None


  def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]:
      global _mtcnn, _facenet
      if _mtcnn is None or _facenet is None:
          device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
@@ -36,6 +67,26 @@ def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]:


  def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
      global _model, _processor
      if _model is None or _processor is None:
          _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
@@ -50,54 +101,190 @@ def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]
      _model.to(DEVICE)
      return _model, _processor


- def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]:
-     """Construye el chat template con imagen + texto + contexto opcional."""
-     ctx_txt = ""
-     if context:
-         try:
-             # breve, sin ruido
-             ctx_txt = "\n\nContexto adicional:\n" + json.dumps(context, ensure_ascii=False)[:2000]
-         except Exception:
-             pass
-     user_txt = (user_text or "Describe la imagen con detalle.") + ctx_txt
-     convo = [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image"},
-                 {"type": "text", "text": user_txt},
-             ],
-         }
-     ]
-     return convo
-
-
- @spaces.GPU  # en HF Spaces usará GPU cuando haya disponibilidad (ZeroGPU)
- def _infer_one(image: Image.Image, text: str, max_new_tokens: int = 256, temperature: float = 0.7,
-                context: Optional[Dict] = None) -> str:
-     # Reducir el tamaño de la imagen para ahorrar memoria en la GPU
      image.thumbnail((1024, 1024))

      model, processor = _lazy_load()
      prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True)
      inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE)
      with torch.inference_mode():
-         out = model.generate(**inputs, max_new_tokens=int(max_new_tokens), temperature=float(temperature))
      return processor.decode(out[0], skip_special_tokens=True).strip()


- # ----------------------------- API helpers -----------------------------------

  def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.",
                   max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
      result = _infer_one(image, text, max_new_tokens, temperature, context=None)
      return {"text": result}


- def describe_batch(images: List[Image.Image], context_json: str,
-                    max_new_tokens: int = 256, temperature: float = 0.7) -> List[str]:
-     """Endpoint batch para ENGINE: lista de imágenes + contexto (JSON) → lista de textos."""
      try:
          context = json.loads(context_json) if context_json else None
      except Exception:
@@ -109,32 +296,22 @@ def describe_batch(images: List[Image.Image], context_json: str,
      return outputs


- @spaces.GPU
  def face_image_embedding(image: Image.Image) -> List[float] | None:
-     try:
-         mtcnn, facenet = _load_face_models()
-         # detectar y extraer cara
-         face = mtcnn(image)
-
-         if face is None:
-             return None
-
-         # FaceNet espera tensor shape (1,3,160,160)
-         device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
-         face = face.unsqueeze(0).to(device)

-         # obtener embedding
-         with torch.no_grad():
-             emb = facenet(face).cpu().numpy()[0]

-         # normalizar igual que tu código original
-         emb = emb / np.linalg.norm(emb)

-         return emb.astype(float).tolist()
-
-     except Exception as e:
-         print(f"Fallo embedding cara: {e}")
-         return None

  @spaces.GPU
  def scenes_extraction(video_file: str, threshold: float, offset_frames: int, crop_ratio: float) -> Tuple[List[Image.Image], List[Dict]] | None:
@@ -185,6 +362,27 @@ def scenes_extraction(video_file: str, threshold: float, offset_frames: int, crop_ratio: float) -> Tuple[List[Image.Image], List[Dict]] | None:

  # ----------------------------- UI & Endpoints --------------------------------

  with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
      gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
      with gr.Row():
 
+ # Standard library
  import json
+ import os
+ from typing import Any, Dict, List, Optional, Tuple, Union

+ # Third-party libraries
+ import cv2
  import gradio as gr
+ import numpy as np
  import spaces
  import torch
+ from facenet_pytorch import InceptionResnetV1, MTCNN
  from PIL import Image
+ from scenedetect import SceneManager, VideoManager
+ from scenedetect.detectors import ContentDetector
  from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration


+ '''
+ # ==============================================================================
+ # Lazy-loading utilities for vision-language and face recognition models
+ # ==============================================================================
+
+ This module provides on-demand initialization of heavyweight components, including:
+ - MTCNN: Face detector used to locate and align faces.
+ - FaceNet (InceptionResnetV1): Generates 512-dimensional facial embeddings.
+ - LLaVA OneVision: Vision-language model for multimodal inference.
+
+ By loading models lazily and caching them in global variables, the system avoids
+ unnecessary reinitialization and reduces startup time, improving performance in
+ production environments such as FastAPI services, Docker deployments, and
+ Hugging Face Spaces.
+ # ==============================================================================
+ '''
  MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")
  DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
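The caching behaviour described in the new module docstring can be illustrated with a short sketch (not part of the commit; it assumes app.py is importable as a module named app and that the Hub is reachable on first use):

import app

# The first call downloads and builds the model; later calls return the cached globals.
model_a, processor_a = app._lazy_load()
model_b, processor_b = app._lazy_load()
assert model_a is model_b and processor_a is processor_b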
 


  def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]:
+     """
+     Lazily loads and initializes the face detection and face embedding models.
+
+     This function loads:
+     - **MTCNN**: Used for face detection and cropping.
+     - **InceptionResnetV1 (FaceNet)**: Used to generate 512-dimensional face embeddings.
+
+     Both models are loaded only once and stored in global variables to avoid
+     unnecessary re-initialization. They are automatically placed on the GPU if
+     available; otherwise the CPU is used.
+
+     Returns:
+         Tuple[MTCNN, InceptionResnetV1]: A tuple containing the initialized
+         face detection model and the face embedding model.
+     """
      global _mtcnn, _facenet
      if _mtcnn is None or _facenet is None:
          device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
 


  def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
+     """
+     Lazily loads the vision-language model and its processor.
+
+     This function performs a first-time load of:
+     - **AutoProcessor**: Handles preprocessing of text and images for the model.
+     - **LlavaOnevisionForConditionalGeneration**: The main multimodal model used
+       for inference and text generation.
+
+     The model is moved to GPU if available and configured with:
+     - The appropriate floating-point precision (`float16` or `float32`)
+     - Low memory usage mode
+     - SafeTensors loading enabled
+
+     Both components are cached in global variables to ensure subsequent calls
+     reuse the loaded instances without reinitialization.
+
+     Returns:
+         Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
+             The loaded model and processor ready for inference.
+     """
      global _model, _processor
      if _model is None or _processor is None:
          _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
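The hunk ends before the actual model-loading call, so the options the docstring mentions are not visible here; a call consistent with that description might look roughly like the following (an assumption for illustration, not the committed code):

# Illustrative sketch only — the committed kwargs are elided from this diff.
_model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,          # float16 on GPU, float32 on CPU
    low_cpu_mem_usage=True,     # "low memory usage mode"
    use_safetensors=True,       # SafeTensors loading enabled
    trust_remote_code=True,
)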
 
      _model.to(DEVICE)
      return _model, _processor

+ '''
+ # ==============================================================================
+ # Auxiliary Model Loading Utilities for API Endpoints
+ # ==============================================================================
+ This block contains helper functions used internally by the API endpoints to
+ load and manage heavy machine learning components efficiently. These utilities
+ handle on-demand initialization ("lazy loading") of both the vision-language
+ model (LLaVA OneVision) and the face detection/embedding models (MTCNN and
+ FaceNet).
+
+ The goal of this helper block is to:
+ - Avoid repeated loading of large models across requests.
+ - Reduce GPU/CPU memory pressure by reusing cached instances.
+ - Provide clean separation between endpoint logic and model-handling logic.
+ - Improve performance and stability in production environments
+   (FastAPI, Docker, Hugging Face Spaces).
+
+ All functions here are intended for internal use and should be called by
+ endpoint handlers when a model is required for a given request.
+ # ==============================================================================
+ '''

+ @spaces.GPU
+ def _infer_one(
+     image: Image.Image,
+     text: str,
+     max_new_tokens: int = 256,
+     temperature: float = 0.7,
+     context: Optional[Dict] = None,
+ ) -> str:
+     """
+     Run a single multimodal inference step using the LLaVA OneVision model.
+
+     This function:
+     - Optionally downsizes the input image to reduce GPU memory consumption.
+     - Loads the model and processor through lazy initialization.
+     - Builds the final prompt by applying the chat template and injecting optional context.
+     - Performs autoregressive generation with configurable token and temperature settings.
+     - Returns the decoded textual output.
+
+     Args:
+         image (Image.Image): Input PIL image used for multimodal conditioning.
+         text (str): User-provided instruction or query.
+         max_new_tokens (int): Maximum number of tokens to generate.
+         temperature (float): Sampling temperature controlling output randomness.
+         context (Optional[Dict]): Additional context injected into the prompt.
+
+     Returns:
+         str: The generated textual response.
+     """
      image.thumbnail((1024, 1024))

      model, processor = _lazy_load()
      prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True)
+
      inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE)
+
      with torch.inference_mode():
+         out = model.generate(
+             **inputs,
+             max_new_tokens=int(max_new_tokens),
+             temperature=float(temperature),
+         )
+
      return processor.decode(out[0], skip_special_tokens=True).strip()
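A minimal local usage sketch for _infer_one (illustrative only; example.jpg and the context key are placeholders, not files or fields defined by this commit):

img = Image.open("example.jpg").convert("RGB")
caption = _infer_one(
    img,
    "Describe la imagen con detalle.",
    max_new_tokens=128,
    temperature=0.7,
    context={"scene": 1},  # hypothetical extra context merged into the prompt
)
print(caption)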
 
+ @spaces.GPU
+ def _get_face_embedding(
+     image: Image.Image
+ ) -> list[float] | None:
+     """
+     Generate a FaceNet embedding for a single face in an image.
+
+     Args:
+         image (Image.Image): A PIL Image containing a face.
+
+     Returns:
+         list[float] | None: Normalized embedding vector for the detected face,
+         or None if no face is detected or an error occurs.
+     """
+     try:
+         mtcnn, facenet = _load_face_models()
+         # Detect and extract face
+         face = mtcnn(image)
+         if face is None:
+             return None
+
+         # FaceNet expects tensor of shape (1,3,160,160)
+         device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
+         face = face.unsqueeze(0).to(device)

+         # Get embedding
+         with torch.no_grad():
+             emb = facenet(face).cpu().numpy()[0]
+
+         # Normalize embedding
+         emb = emb / np.linalg.norm(emb)
+         return emb.astype(float).tolist()
+
+     except Exception as e:
+         print(f"Face embedding failed: {e}")
+         return None
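Because the returned vector is L2-normalized, two embeddings can be compared with a plain dot product (cosine similarity). A short sketch, assuming two local test images that are not part of the repository:

emb_a = _get_face_embedding(Image.open("face_a.jpg"))
emb_b = _get_face_embedding(Image.open("face_b.jpg"))
if emb_a is not None and emb_b is not None:
    similarity = float(np.dot(emb_a, emb_b))  # closer to 1.0 for more similar faces
    print(similarity)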
+
+
+ """
+ # ==============================================================================
+ # API Helpers
+ # ==============================================================================
+ Collection of public-facing API endpoints used by the application.
+
+ This section exposes functions that process incoming requests,
+ perform validation, interact with the model inference helpers,
+ and return structured responses. Each endpoint is designed to be
+ stateless and safe to call from external clients.
+
+ Endpoints in this section typically:
+ - Receive raw data (images, text, base64-encoded content, etc.)
+ - Preprocess inputs before forwarding them to internal inference utilities
+ - Handle optional parameters such as temperature or token limits
+ - Return JSON-serializable dictionaries as responses
+
+ The functions below constitute the interface layer between users
+ and the underlying model logic implemented in the helper utilities.
+ # ==============================================================================
+ """

  def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.",
                   max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
+     """
+     Endpoint to generate a detailed description of an input image.
+
+     This function receives an image and an optional text prompt, then forwards
+     the request to the internal inference helper `_infer_one`. It returns a
+     JSON-serializable dictionary containing the generated text description.
+
+     Parameters
+     ----------
+     image : PIL.Image.Image
+         The input image to be analyzed and described.
+     text : str, optional
+         Instruction or prompt for the model guiding how the image should be described.
+         Defaults to a general "describe in detail" prompt (in Spanish).
+     max_new_tokens : int, optional
+         Maximum number of tokens the model is allowed to generate. Default is 256.
+     temperature : float, optional
+         Sampling temperature controlling randomness of the output. Default is 0.7.
+
+     Returns
+     -------
+     Dict[str, str]
+         A dictionary with a single key `"text"` containing the generated description.
+     """
      result = _infer_one(image, text, max_new_tokens, temperature, context=None)
      return {"text": result}
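For reference, a direct call to the endpoint function (illustrative; example.jpg is a placeholder path):

response = describe_raw(Image.open("example.jpg").convert("RGB"), max_new_tokens=128)
print(response["text"])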


+ def describe_batch(
+     images: List[Image.Image],
+     context_json: str,
+     max_new_tokens: int = 256,
+     temperature: float = 0.7
+ ) -> List[str]:
+     """
+     Batch endpoint for the image description engine.
+
+     This endpoint receives a list of images along with an optional JSON-formatted
+     context, and returns a list of textual descriptions generated by the model.
+     Each image is processed individually using the internal `_infer_one` function,
+     optionally incorporating the context into the prompt.
+
+     Args:
+         images (List[Image.Image]):
+             A list of PIL Image objects to describe.
+         context_json (str):
+             A JSON-formatted string providing additional context for the prompt.
+             If empty or invalid, no context will be used.
+         max_new_tokens (int, optional):
+             Maximum number of tokens to generate per image. Defaults to 256.
+         temperature (float, optional):
+             Sampling temperature controlling text randomness. Defaults to 0.7.
+
+     Returns:
+         List[str]: A list of text descriptions, one for each input image, in order.
+     """
      try:
          context = json.loads(context_json) if context_json else None
      except Exception:
 
      return outputs
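A sketch of how the batch endpoint might be driven (illustrative only; the file names and context fields are placeholders, not values defined by the commit):

frames = [Image.open(p).convert("RGB") for p in ("frame_001.jpg", "frame_002.jpg")]
context = json.dumps({"video": "demo.mp4", "language": "es"})  # hypothetical context fields
captions = describe_batch(frames, context, max_new_tokens=128)
for caption in captions:
    print(caption)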


  def face_image_embedding(image: Image.Image) -> List[float] | None:
+     """
+     Endpoint to generate a face embedding for a given image.
+
+     This function is a thin wrapper around `_get_face_embedding`; the MTCNN and
+     FaceNet models are loaded lazily on first use, so no preloading is required.
+
+     Args:
+         image (Image.Image): Input image containing a face.
+
+     Returns:
+         list[float] | None: Normalized embedding vector, or None if no face is detected.
+     """
+     return _get_face_embedding(image)

  @spaces.GPU
  def scenes_extraction(video_file: str, threshold: float, offset_frames: int, crop_ratio: float) -> Tuple[List[Image.Image], List[Dict]] | None:
 

  # ----------------------------- UI & Endpoints --------------------------------

+ def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]:
+     """Builds the chat template content: image + text + optional context."""
+     ctx_txt = ""
+     if context:
+         try:
+             # keep it brief, without noise
+             ctx_txt = "\n\nContexto adicional:\n" + json.dumps(context, ensure_ascii=False)[:2000]
+         except Exception:
+             pass
+     user_txt = (user_text or "Describe la imagen con detalle.") + ctx_txt
+     convo = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image"},
+                 {"type": "text", "text": user_txt},
+             ],
+         }
+     ]
+     return convo
+
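For a concrete sense of what this helper produces, a call such as _compose_prompt("¿Qué aparece en la imagen?", {"scene": 3}) returns a single-turn conversation (the context key shown is a placeholder):

[
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": '¿Qué aparece en la imagen?\n\nContexto adicional:\n{"scene": 3}'},
        ],
    }
]

processor.apply_chat_template then renders this structure into the model's prompt string, with the {"type": "image"} entry marking where the image features are inserted.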
  with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
      gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
      with gr.Row():