VeuReu committed on
Commit de07c6b · verified · 1 Parent(s): af1ccf1

Update app.py

Files changed (1)
  app.py  +259  -61
app.py CHANGED
@@ -1,21 +1,37 @@
- # app.py — veureu/svision (Salamandra Vision 7B · ZeroGPU) — compatible con ENGINE
- import os
  import json
- from typing import Dict, List, Optional, Tuple, Union, Any

  import gradio as gr
  import spaces
  import torch
- from facenet_pytorch import MTCNN, InceptionResnetV1
- import numpy as np
  from PIL import Image
  from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

- import cv2
- from scenedetect import VideoManager, SceneManager
- from scenedetect.detectors import ContentDetector


  MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")
  DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -27,6 +43,21 @@ _facenet = None


  def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]:
      global _mtcnn, _facenet
      if _mtcnn is None or _facenet is None:
          device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
@@ -36,6 +67,26 @@ def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]:


  def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
      global _model, _processor
      if _model is None or _processor is None:
          _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
@@ -50,54 +101,190 @@ def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]
      _model.to(DEVICE)
      return _model, _processor


- def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]:
-     """Construye el chat template con imagen + texto + contexto opcional."""
-     ctx_txt = ""
-     if context:
-         try:
-             # breve, sin ruido
-             ctx_txt = "\n\nContexto adicional:\n" + json.dumps(context, ensure_ascii=False)[:2000]
-         except Exception:
-             pass
-     user_txt = (user_text or "Describe la imagen con detalle.") + ctx_txt
-     convo = [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image"},
-                 {"type": "text", "text": user_txt},
-             ],
-         }
-     ]
-     return convo
-
-
- @spaces.GPU  # en HF Spaces usará GPU cuando haya disponibilidad (ZeroGPU)
- def _infer_one(image: Image.Image, text: str, max_new_tokens: int = 256, temperature: float = 0.7,
-                context: Optional[Dict] = None) -> str:
-     # Reducir el tamaño de la imagen para ahorrar memoria en la GPU
      image.thumbnail((1024, 1024))

      model, processor = _lazy_load()
      prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True)
      inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE)
      with torch.inference_mode():
-         out = model.generate(**inputs, max_new_tokens=int(max_new_tokens), temperature=float(temperature))
      return processor.decode(out[0], skip_special_tokens=True).strip()


- # ----------------------------- API helpers -----------------------------------

  def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.",
                   max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
      result = _infer_one(image, text, max_new_tokens, temperature, context=None)
      return {"text": result}


- def describe_batch(images: List[Image.Image], context_json: str,
-                    max_new_tokens: int = 256, temperature: float = 0.7) -> List[str]:
-     """Endpoint batch para ENGINE: lista de imágenes + contexto (JSON) → lista de textos."""
      try:
          context = json.loads(context_json) if context_json else None
      except Exception:
@@ -109,32 +296,22 @@ def describe_batch(images: List[Image.Image], context_json: str,
      return outputs


- @spaces.GPU
  def face_image_embedding(image: Image.Image) -> List[float] | None:
-     try:
-         mtcnn, facenet = _load_face_models()
-         # detectar y extraer cara
-         face = mtcnn(image)
-
-         if face is None:
-             return None
-
-         # FaceNet espera tensor shape (1,3,160,160)
-         device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
-         face = face.unsqueeze(0).to(device)

-         # obtener embedding
-         with torch.no_grad():
-             emb = facenet(face).cpu().numpy()[0]

-         # normalizar igual que tu código original
-         emb = emb / np.linalg.norm(emb)

-         return emb.astype(float).tolist()
-
-     except Exception as e:
-         print(f"Fallo embedding cara: {e}")
-         return None

  @spaces.GPU
  def scenes_extraction(video_file: str, threshold: float, offset_frames: int, crop_ratio: float) -> Tuple[List[Image.Image], List[Dict]] | None:
@@ -185,6 +362,27 @@ def scenes_extraction(video_file: str, threshold: float, offset_frames: int, crop_ratio: float) -> Tuple[List[Image.Image], List[Dict]] | None:

  # ----------------------------- UI & Endpoints --------------------------------

  with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
      gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
      with gr.Row():
 
+ # Standard library
  import json
+ import os
+ from typing import Any, Dict, List, Optional, Tuple, Union

+ # Third-party libraries
+ import cv2
  import gradio as gr
+ import numpy as np
  import spaces
  import torch
+ from facenet_pytorch import InceptionResnetV1, MTCNN
  from PIL import Image
+ from scenedetect import SceneManager, VideoManager
+ from scenedetect.detectors import ContentDetector
  from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration


+ '''
+ # ==============================================================================
+ # Lazy-loading utilities for vision-language and face recognition models
+ # ==============================================================================
+
+ This module provides on-demand initialization of heavyweight components, including:
+ - MTCNN: Face detector used to locate and align faces.
+ - FaceNet (InceptionResnetV1): Generates 512-dimensional facial embeddings.
+ - LLaVA OneVision: Vision-language model for multimodal inference.
+
+ By loading models lazily and caching them in global variables, the system avoids
+ unnecessary reinitialization and reduces startup time, improving performance in
+ production environments such as FastAPI services, Docker deployments, and
+ Hugging Face Spaces.
+ # ==============================================================================
+ '''
  MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")
  DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
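The caching behaviour described in the new module docstring can be illustrated with a short sketch (not part of the commit; it assumes app.py is importable as a module named app and that the Hub is reachable on first use):

import app

# The first call downloads and builds the model; later calls return the cached globals.
model_a, processor_a = app._lazy_load()
model_b, processor_b = app._lazy_load()
assert model_a is model_b and processor_a is processor_b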
 


  def _load_face_models() -> Tuple[MTCNN, InceptionResnetV1]:
+     """
+     Lazily loads and initializes the face detection and face embedding models.
+
+     This function loads:
+     - **MTCNN**: Used for face detection and cropping.
+     - **InceptionResnetV1 (FaceNet)**: Used to generate 512-dimensional face embeddings.
+
+     Both models are loaded only once and stored in global variables to avoid
+     unnecessary re-initialization. They are automatically placed on the GPU if
+     available; otherwise the CPU is used.
+
+     Returns:
+         Tuple[MTCNN, InceptionResnetV1]: A tuple containing the initialized
+         face detection model and the face embedding model.
+     """
      global _mtcnn, _facenet
      if _mtcnn is None or _facenet is None:
          device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
 


  def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
+     """
+     Lazily loads the vision-language model and its processor.
+
+     This function performs a first-time load of:
+     - **AutoProcessor**: Handles preprocessing of text and images for the model.
+     - **LlavaOnevisionForConditionalGeneration**: The main multimodal model used
+       for inference and text generation.
+
+     The model is moved to GPU if available and configured with:
+     - The appropriate floating-point precision (`float16` or `float32`)
+     - Low memory usage mode
+     - SafeTensors loading enabled
+
+     Both components are cached in global variables to ensure subsequent calls
+     reuse the loaded instances without reinitialization.
+
+     Returns:
+         Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
+             The loaded model and processor ready for inference.
+     """
      global _model, _processor
      if _model is None or _processor is None:
          _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
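The hunk ends before the actual model-loading call, so the options the docstring mentions are not visible here; a call consistent with that description might look roughly like the following (an assumption for illustration, not the committed code):

# Illustrative sketch only — the committed kwargs are elided from this diff.
_model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,          # float16 on GPU, float32 on CPU
    low_cpu_mem_usage=True,     # "low memory usage mode"
    use_safetensors=True,       # SafeTensors loading enabled
    trust_remote_code=True,
)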
 
      _model.to(DEVICE)
      return _model, _processor

+ '''
+ # ==============================================================================
+ # Auxiliary Model Loading Utilities for API Endpoints
+ # ==============================================================================
+ This block contains helper functions used internally by the API endpoints to
+ load and manage heavy machine learning components efficiently. These utilities
+ handle on-demand initialization ("lazy loading") of both the vision-language
+ model (LLaVA OneVision) and the face detection/embedding models (MTCNN and
+ FaceNet).
+
+ The goal of this helper block is to:
+ - Avoid repeated loading of large models across requests.
+ - Reduce GPU/CPU memory pressure by reusing cached instances.
+ - Provide clean separation between endpoint logic and model-handling logic.
+ - Improve performance and stability in production environments
+   (FastAPI, Docker, Hugging Face Spaces).
+
+ All functions here are intended for internal use and should be called by
+ endpoint handlers when a model is required for a given request.
+ # ==============================================================================
+ '''

+ @spaces.GPU
+ def _infer_one(
+     image: Image.Image,
+     text: str,
+     max_new_tokens: int = 256,
+     temperature: float = 0.7,
+     context: Optional[Dict] = None,
+ ) -> str:
+     """
+     Run a single multimodal inference step using the LLaVA OneVision model.
+
+     This function:
+     - Optionally downsizes the input image to reduce GPU memory consumption.
+     - Loads the model and processor through lazy initialization.
+     - Builds the final prompt by applying the chat template and injecting optional context.
+     - Performs autoregressive generation with configurable token and temperature settings.
+     - Returns the decoded textual output.
+
+     Args:
+         image (Image.Image): Input PIL image used for multimodal conditioning.
+         text (str): User-provided instruction or query.
+         max_new_tokens (int): Maximum number of tokens to generate.
+         temperature (float): Sampling temperature controlling output randomness.
+         context (Optional[Dict]): Additional context injected into the prompt.
+
+     Returns:
+         str: The generated textual response.
+     """
      image.thumbnail((1024, 1024))

      model, processor = _lazy_load()
      prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True)
+
      inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE)
+
      with torch.inference_mode():
+         out = model.generate(
+             **inputs,
+             max_new_tokens=int(max_new_tokens),
+             temperature=float(temperature),
+         )
+
      return processor.decode(out[0], skip_special_tokens=True).strip()
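A minimal local usage sketch for _infer_one (illustrative only; example.jpg and the context key are placeholders, not files or fields defined by this commit):

img = Image.open("example.jpg").convert("RGB")
caption = _infer_one(
    img,
    "Describe la imagen con detalle.",
    max_new_tokens=128,
    temperature=0.7,
    context={"scene": 1},  # hypothetical extra context merged into the prompt
)
print(caption)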
 
+ @spaces.GPU
+ def _get_face_embedding(
+     image: Image.Image
+ ) -> list[float] | None:
+     """
+     Generate a FaceNet embedding for a single face in an image.
+
+     Args:
+         image (Image.Image): A PIL Image containing a face.
+
+     Returns:
+         list[float] | None: Normalized embedding vector for the detected face,
+         or None if no face is detected or an error occurs.
+     """
+     try:
+         mtcnn, facenet = _load_face_models()
+         # Detect and extract face
+         face = mtcnn(image)
+         if face is None:
+             return None
+
+         # FaceNet expects tensor of shape (1,3,160,160)
+         device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
+         face = face.unsqueeze(0).to(device)

+         # Get embedding
+         with torch.no_grad():
+             emb = facenet(face).cpu().numpy()[0]
+
+         # Normalize embedding
+         emb = emb / np.linalg.norm(emb)
+         return emb.astype(float).tolist()
+
+     except Exception as e:
+         print(f"Face embedding failed: {e}")
+         return None
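Because the returned vector is L2-normalized, two embeddings can be compared with a plain dot product (cosine similarity). A short sketch, assuming two local test images that are not part of the repository:

emb_a = _get_face_embedding(Image.open("face_a.jpg"))
emb_b = _get_face_embedding(Image.open("face_b.jpg"))
if emb_a is not None and emb_b is not None:
    similarity = float(np.dot(emb_a, emb_b))  # closer to 1.0 for more similar faces
    print(similarity)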
+
+
+ """
+ # ==============================================================================
+ # API Helpers
+ # ==============================================================================
+ Collection of public-facing API endpoints used by the application.
+
+ This section exposes functions that process incoming requests,
+ perform validation, interact with the model inference helpers,
+ and return structured responses. Each endpoint is designed to be
+ stateless and safe to call from external clients.
+
+ Endpoints in this section typically:
+ - Receive raw data (images, text, base64-encoded content, etc.)
+ - Preprocess inputs before forwarding them to internal inference utilities
+ - Handle optional parameters such as temperature or token limits
+ - Return JSON-serializable dictionaries as responses
+
+ The functions below constitute the interface layer between users
+ and the underlying model logic implemented in the helper utilities.
+ # ==============================================================================
+ """

  def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.",
                   max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
+     """
+     Endpoint to generate a detailed description of an input image.
+
+     This function receives an image and an optional text prompt, then forwards
+     the request to the internal inference helper `_infer_one`. It returns a
+     JSON-serializable dictionary containing the generated text description.
+
+     Parameters
+     ----------
+     image : PIL.Image.Image
+         The input image to be analyzed and described.
+     text : str, optional
+         Instruction or prompt for the model guiding how the image should be described.
+         Defaults to a general "describe in detail" prompt (in Spanish).
+     max_new_tokens : int, optional
+         Maximum number of tokens the model is allowed to generate. Default is 256.
+     temperature : float, optional
+         Sampling temperature controlling randomness of the output. Default is 0.7.
+
+     Returns
+     -------
+     Dict[str, str]
+         A dictionary with a single key `"text"` containing the generated description.
+     """
      result = _infer_one(image, text, max_new_tokens, temperature, context=None)
      return {"text": result}
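For reference, a direct call to the endpoint function (illustrative; example.jpg is a placeholder path):

response = describe_raw(Image.open("example.jpg").convert("RGB"), max_new_tokens=128)
print(response["text"])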


+ def describe_batch(
+     images: List[Image.Image],
+     context_json: str,
+     max_new_tokens: int = 256,
+     temperature: float = 0.7
+ ) -> List[str]:
+     """
+     Batch endpoint for the image description engine.
+
+     This endpoint receives a list of images along with an optional JSON-formatted
+     context, and returns a list of textual descriptions generated by the model.
+     Each image is processed individually using the internal `_infer_one` function,
+     optionally incorporating the context into the prompt.
+
+     Args:
+         images (List[Image.Image]):
+             A list of PIL Image objects to describe.
+         context_json (str):
+             A JSON-formatted string providing additional context for the prompt.
+             If empty or invalid, no context will be used.
+         max_new_tokens (int, optional):
+             Maximum number of tokens to generate per image. Defaults to 256.
+         temperature (float, optional):
+             Sampling temperature controlling text randomness. Defaults to 0.7.
+
+     Returns:
+         List[str]: A list of text descriptions, one for each input image, in order.
+     """
      try:
          context = json.loads(context_json) if context_json else None
      except Exception:
 
      return outputs
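A sketch of how the batch endpoint might be driven (illustrative only; the file names and context fields are placeholders, not values defined by the commit):

frames = [Image.open(p).convert("RGB") for p in ("frame_001.jpg", "frame_002.jpg")]
context = json.dumps({"video": "demo.mp4", "language": "es"})  # hypothetical context fields
captions = describe_batch(frames, context, max_new_tokens=128)
for caption in captions:
    print(caption)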


  def face_image_embedding(image: Image.Image) -> List[float] | None:
+     """
+     Endpoint to generate a face embedding for a given image.
+
+     This function is a thin wrapper around `_get_face_embedding`; the MTCNN and
+     FaceNet models are loaded lazily on first use, so no preloading is required.
+
+     Args:
+         image (Image.Image): Input image containing a face.
+
+     Returns:
+         list[float] | None: Normalized embedding vector, or None if no face is detected.
+     """
+     return _get_face_embedding(image)

  @spaces.GPU
  def scenes_extraction(video_file: str, threshold: float, offset_frames: int, crop_ratio: float) -> Tuple[List[Image.Image], List[Dict]] | None:
 

  # ----------------------------- UI & Endpoints --------------------------------

+ def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]:
+     """Builds the chat template content: image + text + optional context."""
+     ctx_txt = ""
+     if context:
+         try:
+             # keep it brief, without noise
+             ctx_txt = "\n\nContexto adicional:\n" + json.dumps(context, ensure_ascii=False)[:2000]
+         except Exception:
+             pass
+     user_txt = (user_text or "Describe la imagen con detalle.") + ctx_txt
+     convo = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image"},
+                 {"type": "text", "text": user_txt},
+             ],
+         }
+     ]
+     return convo
+
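For a concrete sense of what this helper produces, a call such as _compose_prompt("¿Qué aparece en la imagen?", {"scene": 3}) returns a single-turn conversation (the context key shown is a placeholder):

[
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": '¿Qué aparece en la imagen?\n\nContexto adicional:\n{"scene": 3}'},
        ],
    }
]

processor.apply_chat_template then renders this structure into the model's prompt string, with the {"type": "image"} entry marking where the image features are inserted.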
  with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
      gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
      with gr.Row():