Update app.py
app.py
CHANGED
@@ -1,21 +1,37 @@
-#
-import os
 import json
-

 import gradio as gr
 import spaces
 import torch
-from facenet_pytorch import
-import numpy as np
 from PIL import Image
 from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

-import cv2
-from scenedetect import VideoManager, SceneManager
-from scenedetect.detectors import ContentDetector


 MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")
 DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -27,6 +43,21 @@ _facenet = None


 def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]:
     global _mtcnn, _facenet
     if _mtcnn is None or _facenet is None:
         device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
@@ -36,6 +67,26 @@ def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]:


 def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
     global _model, _processor
     if _model is None or _processor is None:
         _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
@@ -50,54 +101,190 @@ def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]
         _model.to(DEVICE)
     return _model, _processor


-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
     image.thumbnail((1024, 1024))

     model, processor = _lazy_load()
     prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True)
     inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE)
     with torch.inference_mode():
-        out = model.generate(
     return processor.decode(out[0], skip_special_tokens=True).strip()


-#

 def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.",
                  max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
     result = _infer_one(image, text, max_new_tokens, temperature, context=None)
     return {"text": result}


-def describe_batch(
-
-
     try:
         context = json.loads(context_json) if context_json else None
     except Exception:
@@ -109,32 +296,22 @@ def describe_batch(images: List[Image.Image], context_json: str,
     return outputs


-@spaces.GPU
 def face_image_embedding(image: Image.Image) -> List[float] | None:
-
-
-    # detect and extract the face
-    face = mtcnn(image)
-
-    if face is None:
-        return None
-
-    # FaceNet expects a tensor of shape (1, 3, 160, 160)
-    device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
-    face = face.unsqueeze(0).to(device)

-
-
-    emb = facenet(face).cpu().numpy()[0]

-
-

-
-
-
-
-    return None

 @spaces.GPU
 def scenes_extraction(video_file: str, threshold: float, offset_frames: int, crop_ratio: float) -> Tuple[List[Image.Image], List[Dict]] | None:
@@ -185,6 +362,27 @@ def scenes_extraction(video_file: str, threshold: float, offset_frames: int, cro

 # ----------------------------- UI & Endpoints --------------------------------

 with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
     gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
     with gr.Row():
+# Standard library
 import json
+import os
+from typing import Any, Dict, List, Optional, Tuple, Union

+# Third-party libraries
+import cv2
 import gradio as gr
+import numpy as np
 import spaces
 import torch
+from facenet_pytorch import InceptionResnetV1, MTCNN
 from PIL import Image
+from scenedetect import SceneManager, VideoManager
+from scenedetect.detectors import ContentDetector
 from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration


+'''
+# ==============================================================================
+# Lazy-loading utilities for vision-language and face recognition models
+# ==============================================================================
+
+This module provides on-demand initialization of heavyweight components, including:
+- MTCNN: Face detector used to locate and align faces.
+- FaceNet (InceptionResnetV1): Generates 512-dimensional facial embeddings.
+- LLaVA OneVision: Vision-language model for multimodal inference.

+By loading models lazily and caching them in global variables, the system avoids
+unnecessary reinitialization and reduces startup time, improving performance in
+production environments such as FastAPI services, Docker deployments, and
+Hugging Face Spaces.
+# ==============================================================================
+'''
 MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")
 DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


 def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]:
+    """
+    Lazily loads and initializes the facial detection and facial embedding models.
+
+    This function loads:
+    - **MTCNN**: Used for face detection and cropping.
+    - **InceptionResnetV1 (FaceNet)**: Used to generate 512-dimensional face embeddings.
+
+    Both models are loaded only once and stored in global variables to avoid
+    unnecessary re-initialization. They are automatically placed on GPU if available,
+    otherwise CPU is used.
+
+    Returns:
+        Tuple[MTCNN, InceptionResnetV1]: A tuple containing the initialized
+        face detection model and the face embedding model.
+    """
     global _mtcnn, _facenet
     if _mtcnn is None or _facenet is None:
         device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
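The initialization that follows this `device` selection is not shown in the hunk. A minimal sketch of a body consistent with the docstring above, assuming typical facenet_pytorch constructor arguments (the exact ones used in app.py are not visible here):

```python
# Hedged sketch only: constructor arguments are assumptions, not taken from app.py.
from facenet_pytorch import MTCNN, InceptionResnetV1

def _load_face_models_sketch(device: str = "cpu"):
    # MTCNN detects a face and returns an aligned 160x160 crop, the size FaceNet expects.
    mtcnn = MTCNN(image_size=160, margin=0, device=device)
    # InceptionResnetV1 pretrained on VGGFace2 maps each crop to a 512-d embedding.
    facenet = InceptionResnetV1(pretrained="vggface2").eval().to(device)
    return mtcnn, facenet
```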


 def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
+    """
+    Lazily loads the vision-language model and its processor.
+
+    This function performs a first-time load of:
+    - **AutoProcessor**: Handles preprocessing of text and images for the model.
+    - **LlavaOnevisionForConditionalGeneration**: The main multimodal model used
+      for inference and text generation.
+
+    The model is moved to GPU if available and configured with:
+    - The appropriate floating-point precision (`float16` or `float32`)
+    - Low memory usage mode
+    - SafeTensors loading enabled
+
+    Both components are cached in global variables to ensure subsequent calls
+    reuse the loaded instances without reinitialization.
+
+    Returns:
+        Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
+            The loaded model and processor ready for inference.
+    """
     global _model, _processor
     if _model is None or _processor is None:
         _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
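The model load that follows (between this line and the `_model.to(DEVICE)` context below) is not part of the hunk. A sketch of a `from_pretrained` call matching the options the docstring lists; the exact keyword values are assumptions:

```python
# Hedged sketch: the precise kwargs used in app.py are not shown in this diff.
import os
import torch
from transformers import LlavaOnevisionForConditionalGeneration

MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,          # float16 on CUDA, float32 on CPU
    low_cpu_mem_usage=True,     # "low memory usage mode"
    use_safetensors=True,       # "SafeTensors loading enabled"
    trust_remote_code=True,
)
model.to(DEVICE)
```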
         _model.to(DEVICE)
     return _model, _processor

+'''
+# ==============================================================================
+# Auxiliary Model Loading Utilities for API Endpoints
+# ==============================================================================
+This module contains helper functions used internally by the API endpoints to
+efficiently load and manage heavy machine learning components. These utilities
+handle on-demand initialization ("lazy loading") of both the vision-language
+model (LLaVA OneVision) and the facial detection/embedding models (MTCNN and
+FaceNet).
+
+The goal of this helper block is to:
+- Avoid repeated loading of large models across requests.
+- Reduce GPU/CPU memory pressure by reusing cached instances.
+- Provide clean separation between endpoint logic and model-handling logic.
+- Improve performance and stability in production environments
+  (FastAPI, Docker, Hugging Face Spaces).
+
+All functions here are intended for internal use and should be called by
+endpoint handlers when a model is required for a given request.
+# ==============================================================================
+'''

+@spaces.GPU
+def _infer_one(
+    image: Image.Image,
+    text: str,
+    max_new_tokens: int = 256,
+    temperature: float = 0.7,
+    context: Optional[Dict] = None,
+) -> str:
+    """
+    Run a single multimodal inference step using the LLaVA OneVision model.
+
+    This function:
+    - Optionally downsizes the input image to reduce GPU memory consumption.
+    - Loads the model and processor through lazy initialization.
+    - Builds the final prompt by applying the chat template and injecting optional context.
+    - Performs autoregressive generation with configurable token and temperature settings.
+    - Returns the decoded textual output.
+
+    Args:
+        image (Image.Image): Input PIL image used for multimodal conditioning.
+        text (str): User-provided instruction or query.
+        max_new_tokens (int): Maximum number of tokens to generate.
+        temperature (float): Sampling temperature controlling output randomness.
+        context (Optional[Dict]): Additional context injected into the prompt.
+
+    Returns:
+        str: The generated textual response.
+    """
     image.thumbnail((1024, 1024))

     model, processor = _lazy_load()
     prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True)
+
     inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE)
+
     with torch.inference_mode():
+        out = model.generate(
+            **inputs,
+            max_new_tokens=int(max_new_tokens),
+            temperature=float(temperature),
+        )
+
     return processor.decode(out[0], skip_special_tokens=True).strip()
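For reference, a direct call to this helper looks like the following (the image path is a placeholder; in the Space the call is made by the endpoint functions below):

```python
# Hypothetical usage of _infer_one(); "example.jpg" is a placeholder path.
from PIL import Image

img = Image.open("example.jpg")
answer = _infer_one(img, "Describe la imagen con detalle.", max_new_tokens=128, temperature=0.7)
print(answer)
```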

+@spaces.GPU
+def _get_face_embedding(
+    image: Image.Image
+) -> list[float] | None:
+    """
+    Generate a FaceNet embedding for a single face in an image.
+
+    Args:
+        image (Image.Image): A PIL Image containing a face.
+
+    Returns:
+        list[float] | None: Normalized embedding vector for the detected face,
+        or None if no face is detected or an error occurs.
+    """
+    try:
+        mtcnn, facenet = _load_face_models()
+        # Detect and extract face
+        face = mtcnn(image)
+        if face is None:
+            return None
+
+        # FaceNet expects tensor of shape (1, 3, 160, 160)
+        device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
+        face = face.unsqueeze(0).to(device)

+        # Get embedding
+        with torch.no_grad():
+            emb = facenet(face).cpu().numpy()[0]
+
+        # Normalize embedding
+        emb = emb / np.linalg.norm(emb)
+        return emb.astype(float).tolist()
+
+    except Exception as e:
+        print(f"Face embedding failed: {e}")
+        return None
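Because the returned vector is L2-normalized, comparing two faces reduces to a dot product. A small usage sketch (the image paths are placeholders):

```python
# Hypothetical comparison of two embeddings produced by _get_face_embedding().
import numpy as np
from PIL import Image

emb_a = _get_face_embedding(Image.open("person_a.jpg"))
emb_b = _get_face_embedding(Image.open("person_b.jpg"))

if emb_a is not None and emb_b is not None:
    # Both vectors are unit-length, so the dot product equals the cosine similarity.
    similarity = float(np.dot(emb_a, emb_b))
    print(f"cosine similarity: {similarity:.3f}")
```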
+
+
+"""
+# ==============================================================================
+# API Helpers
+# ==============================================================================
+Collection of public-facing API endpoints used by the application.
+
+This section exposes functions that process incoming requests,
+perform validation, interact with the model inference helpers,
+and return structured responses. Each endpoint is designed to be
+stateless, deterministic, and safe to call from external clients.
+
+Endpoints in this module typically:
+- Receive raw data (images, text, base64-encoded content, etc.)
+- Preprocess inputs before forwarding them to internal inference utilities
+- Handle optional parameters such as temperature or token limits
+- Return JSON-serializable dictionaries as responses
+
+The functions below constitute the interface layer between users
+and the underlying model logic implemented in the helper utilities.
+# ==============================================================================
+"""

 def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.",
                  max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
+    """
+    Endpoint to generate a detailed description of an input image.
+
+    This function receives an image and an optional text prompt, then forwards
+    the request to the internal inference helper `_infer_one`. It returns a JSON-
+    serializable dictionary containing the generated text description.
+
+    Parameters
+    ----------
+    image : PIL.Image.Image
+        The input image to be analyzed and described.
+    text : str, optional
+        Instruction or prompt for the model guiding how the image should be described.
+        Defaults to a general "describe in detail" prompt (in Spanish).
+    max_new_tokens : int, optional
+        Maximum number of tokens the model is allowed to generate. Default is 256.
+    temperature : float, optional
+        Sampling temperature controlling randomness of the output. Default is 0.7.
+
+    Returns
+    -------
+    Dict[str, str]
+        A dictionary with a single key `"text"` containing the generated description.
+    """
     result = _infer_one(image, text, max_new_tokens, temperature, context=None)
     return {"text": result}


+def describe_batch(
+    images: List[Image.Image],
+    context_json: str,
+    max_new_tokens: int = 256,
+    temperature: float = 0.7
+) -> List[str]:
+    """
+    Batch endpoint for the image description engine.
+
+    This endpoint receives a list of images along with an optional JSON-formatted
+    context, and returns a list of textual descriptions generated by the model.
+    Each image is processed individually using the internal `_infer_one` function,
+    optionally incorporating the context into the prompt.
+
+    Args:
+        images (List[Image.Image]):
+            A list of PIL Image objects to describe.
+        context_json (str):
+            A JSON-formatted string providing additional context for the prompt.
+            If empty or invalid, no context will be used.
+        max_new_tokens (int, optional):
+            Maximum number of tokens to generate per image. Defaults to 256.
+        temperature (float, optional):
+            Sampling temperature controlling text randomness. Defaults to 0.7.
+
+    Returns:
+        List[str]: A list of text descriptions, one for each input image, in order.
+    """
     try:
         context = json.loads(context_json) if context_json else None
     except Exception:
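The body between this `except` and the `return outputs` shown in the next excerpt is not part of the diff. A sketch of the per-image loop the docstring describes; the fallback prompt string is an assumption:

```python
# Hedged sketch of the loop described in the docstring; not the verbatim app.py body.
def describe_batch_sketch(images, context=None, max_new_tokens=256, temperature=0.7):
    outputs = []
    for img in images:
        # Each image goes through the same single-image helper, reusing the cached model.
        outputs.append(
            _infer_one(img, "Describe la imagen con detalle.",
                       max_new_tokens, temperature, context=context)
        )
    return outputs
```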
     return outputs


 def face_image_embedding(image: Image.Image) -> List[float] | None:
+    """
+    Endpoint to generate a face embedding for a given image.
+
+    This function wraps the core `_get_face_embedding` logic for use in endpoints;
+    the MTCNN and FaceNet models are loaded lazily inside that helper.
+
+    Args:
+        image (Image.Image): Input image containing a face.
+
+    Returns:
+        list[float] | None: Normalized embedding vector or None if no face detected.
+    """
+    return _get_face_embedding(image)

 @spaces.GPU
 def scenes_extraction(video_file: str, threshold: float, offset_frames: int, crop_ratio: float) -> Tuple[List[Image.Image], List[Dict]] | None:
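The body of `scenes_extraction()` (new lines 318-361) is not included in this diff. A sketch of what scene-cut extraction with the imported PySceneDetect classes and OpenCV could look like; the handling of `crop_ratio`, the metadata keys, and the exact return shape are assumptions:

```python
# Hedged sketch only; the real implementation in app.py is not shown in this diff.
import cv2
from PIL import Image
from scenedetect import SceneManager, VideoManager
from scenedetect.detectors import ContentDetector

def scenes_extraction_sketch(video_file, threshold=27.0, offset_frames=5, crop_ratio=1.0):
    # Detect scene cuts with a content-based detector.
    video_manager = VideoManager([video_file])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=threshold))
    video_manager.start()
    scene_manager.detect_scenes(frame_source=video_manager)
    scenes = scene_manager.get_scene_list()
    video_manager.release()

    images, meta = [], []
    cap = cv2.VideoCapture(video_file)
    for start, end in scenes:
        # Grab a frame a few frames past the cut to avoid transition blur.
        cap.set(cv2.CAP_PROP_POS_FRAMES, start.get_frames() + offset_frames)
        ok, frame = cap.read()
        if not ok:
            continue
        h, w = frame.shape[:2]
        if 0 < crop_ratio < 1:
            # Center-crop to the given ratio (interpretation of crop_ratio is an assumption).
            ch, cw = int(h * crop_ratio), int(w * crop_ratio)
            y0, x0 = (h - ch) // 2, (w - cw) // 2
            frame = frame[y0:y0 + ch, x0:x0 + cw]
        images.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        meta.append({"start_frame": start.get_frames(), "end_frame": end.get_frames()})
    cap.release()
    return images, meta
```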

 # ----------------------------- UI & Endpoints --------------------------------

+def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]:
+    """Build the chat template with image + text + optional context."""
+    ctx_txt = ""
+    if context:
+        try:
+            # keep it brief, without noise
+            ctx_txt = "\n\nContexto adicional:\n" + json.dumps(context, ensure_ascii=False)[:2000]
+        except Exception:
+            pass
+    user_txt = (user_text or "Describe la imagen con detalle.") + ctx_txt
+    convo = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": user_txt},
+            ],
+        }
+    ]
+    return convo
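For reference, the returned conversation feeds straight into the processor's chat template; a hypothetical call (the context dict is illustrative):

```python
# Hypothetical usage; assumes the processor returned by _lazy_load().
model, processor = _lazy_load()
convo = _compose_prompt("¿Qué aparece en la imagen?", context={"lugar": "Barcelona"})
prompt = processor.apply_chat_template(convo, add_generation_prompt=True)
# `prompt` now contains an image placeholder plus the user text and serialized context.
```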
+
 with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
     gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
     with gr.Row():