VeuReu committed
Commit 0aa10f8 · verified · 1 Parent(s): c55a9c1

Update app.py

Files changed (1)
  1. app.py +374 -108
app.py CHANGED
@@ -28,6 +28,7 @@ APIs/UI and the underlying machine learning models.
 # Standard library
 import json
 import os
+import re
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 # Third-party libraries
@@ -41,6 +42,8 @@ from PIL import Image
 from scenedetect import SceneManager, VideoManager
 from scenedetect.detectors import ContentDetector
 from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
+from wordfreq import zipf_frequency
+import easyocr
 
 
 '''
@@ -210,22 +213,31 @@ def _get_face_embedding(
     """
     try:
         mtcnn, facenet = _load_face_models()
-        # Detect and extract face
-        face = mtcnn(image)
-        if face is None:
-            return None
 
-        # FaceNet expects tensor of shape (1,3,160,160)
+        boxes, probs = mtcnn.detect(image)
+
+        if boxes is None:
+            return []
+
+        embeddings = []
         device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
-        face = face.unsqueeze(0).to(device)
+
+        for box in boxes:
+            x1, y1, x2, y2 = map(int, box)
+            face = image.crop((x1, y1, x2, y2))
 
-        # Get embedding
-        with torch.no_grad():
-            emb = facenet(face).cpu().numpy()[0]
+            face_tensor = mtcnn(face)
+            if face_tensor is None:
+                continue
+            face_tensor = face_tensor.unsqueeze(0).to(device)
 
-        # Normalize embedding
-        emb = emb / np.linalg.norm(emb)
-        return emb.astype(float).tolist()
+            with torch.no_grad():
+                emb = facenet(face_tensor).cpu().numpy()[0]
+
+            emb = emb / np.linalg.norm(emb)
+            embeddings.append(emb.astype(float).tolist())
+
+        return embeddings
 
     except Exception as e:
         print(f"Face embedding failed: {e}")
@@ -296,6 +308,216 @@ def _get_scenes_extraction(
         print("Error in scenes_extraction:", e)
         return None, None
 
+@spaces.GPU
+def _get_image_list_description(
+    images: List[Image.Image]
+) -> List[str]:
+    """
+    Generate brief visual descriptions for a list of PIL Images using Salamandra Vision.
+
+    Args:
+        images (List[Image.Image]): List of PIL Image objects to describe.
+
+    Returns:
+        List[str]: List of descriptions, one per image.
+    """
+    list_images = [x[0] for x in images]
+
+    # Load the Salamandra Vision model
+    path_model = "BSC-LT/salamandra-7b-vision"
+    processor = AutoProcessor.from_pretrained(path_model)
+    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
+        path_model,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=False
+    ).to("cuda")
+
+    # System prompt for image description
+    sys_prompt = (
+        "Ets un expert en narrativa visual. "
+        "Descriu la imatge de manera molt breu i senzilla en català, "
+        "explicant només l'acció principal que es veu. "
+        "Respon amb una única frase curta (màxim 10–20 paraules), "
+        "sense afegir detalls innecessaris ni descriure el fons."
+    )
+
+    all_results = []
+
+    for img in list_images:
+        batch = [img]
+
+        # Create the conversation template
+        conversation = [
+            {"role": "system", "content": sys_prompt},
+            {"role": "user", "content": [
+                {"type": "image", "image": batch[0]},
+                {"type": "text", "text": (
+                    "Descriu la imatge de manera molt breu i senzilla en català."
+                )}
+            ]}
+        ]
+        prompt_batch = processor.apply_chat_template(conversation, add_generation_prompt=True)
+
+        # Prepare inputs for the model
+        inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
+        for k, v in inputs.items():
+            if v.dtype.is_floating_point:
+                inputs[k] = v.to("cuda", torch.float16)
+            else:
+                inputs[k] = v.to("cuda")
+
+        # Generate the description
+        output = model.generate(**inputs, max_new_tokens=1024)
+        text = processor.decode(output[0], skip_special_tokens=True)
+        lines = text.split("\n")
+
+        # Extract the assistant's answer
+        desc = ""
+        for i, line in enumerate(lines):
+            if line.lower().startswith(" assistant"):
+                desc = "\n".join(lines[i+1:]).strip()
+                break
+
+        all_results.append(desc)
+
+    return all_results
+
+@spaces.GPU
+def _get_ocr_characters_to_image(
+    image: Image.Image,
+    informacion_image: Dict[str, Any],
+    face_col: List[Dict[str, Any]]
+) -> Dict[str, Any]:
+    """
+    Process an input image by detecting faces, generating face embeddings,
+    performing K-nearest neighbors (KNN) matching against a known face database,
+    and extracting OCR (Optical Character Recognition) text using EasyOCR.
+
+    The function performs the following steps:
+    1. Detects faces in the image and generates embeddings for each face.
+    2. For each detected face, retrieves the top 3 closest embeddings from the
+       reference database and determines the identity if the distance is below
+       a defined threshold.
+    3. Executes OCR using EasyOCR to extract textual content from the image.
+       It filters the OCR output by removing uncommon or noisy words, and
+       validates results using zipf word frequency to ensure linguistic relevance.
+    4. Returns a dictionary containing metadata, detected identities, and OCR text.
+
+    Parameters
+    ----------
+    image : PIL.Image.Image
+        The image to process.
+    informacion_image : Dict[str, Any]
+        Metadata about the image (index, start time, end time), provided as JSON.
+    face_col : List[Dict[str, Any]]
+        A list of dictionaries containing stored face embeddings and names,
+        provided as JSON.
+
+    Returns
+    -------
+    Dict[str, Any]
+        A dictionary containing:
+        - id: image identifier
+        - start: start timestamp
+        - end: end timestamp
+        - faces: list of detected identities
+        - ocr: extracted OCR text
+    """
+
+    # First, detect faces in the image and generate embeddings for each of them.
+    raw_faces = _get_face_embedding(image)
+    informacion_image_dict = json.loads(informacion_image)
+    face_col = json.loads(face_col)
+    faces_detected = []
+
+    for f in raw_faces:
+        embedding_image = f
+        identity = "Desconegut"
+        knn = []
+
+        # Now search for the 3 nearest neighbors in the database for each embedding.
+        if face_col and embedding_image is not None:
+            try:
+                num_embeddings = len(face_col)
+
+                if num_embeddings < 1:
+                    knn = []
+                    identity = "Desconegut"
+
+                else:
+                    n_results = min(3, num_embeddings)
+
+                    embedding_image = np.array(embedding_image)
+
+                    distances_embedding = []
+
+                    # Compute Euclidean distance between the detected face and each stored embedding
+                    for image_base_datos in face_col:
+                        image_base_datos_embedding = np.array(image_base_datos["embedding"])
+                        distance = np.linalg.norm(embedding_image - image_base_datos_embedding)
+                        distances_embedding.append({
+                            "identity": image_base_datos["nombre"],
+                            "distance": float(distance)
+                        })
+
+                    # Sort by distance and keep the top N matches
+                    distances_embedding = sorted(distances_embedding, key=lambda x: x["distance"])
+                    knn = distances_embedding[:n_results]
+
+                    # Assign identity if closest match is below distance threshold
+                    if knn and knn[0]["distance"] < 0.8:
+                        identity = knn[0]["identity"]
+                    else:
+                        identity = "Desconegut"
+
+            except Exception as e:
+                print(f"Face KNN failed: {e}")
+                knn = []
+                identity = "Desconegut"
+
+        faces_detected.append(identity)
+
+    # Now perform OCR detection
+    ocr_text_easyocr = ""
+    use_easyocr = True
+    if use_easyocr:
+        try:
+            rgb = np.array(image)
+            bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
+
+            # EasyOCR reader for English and Spanish
+            reader = easyocr.Reader(['en', 'es'], gpu=True)
+            results = reader.readtext(bgr)
+
+            # Join OCR results into a single text string
+            ocr_text_easyocr = " ".join([text for _, text, _ in results]).strip()
+
+            # Filter out uncommon or malformed words
+            palabras_ocr_text = ocr_text_easyocr.split()
+            palabras_ocr_text = [p for p in palabras_ocr_text if re.fullmatch(r'[A-Za-zÀ-ÿ]+', p)]
+
+            # Keep OCR text only if at least one word is linguistically valid
+            for palabra in palabras_ocr_text:
+                if zipf_frequency(palabra, "ca") != 0.0:
+                    break
+            else:
+                ocr_text_easyocr = ""
+
+        except Exception as e:
+            print(f"OCR error: {e}")
+            return None
+
+    # Final structured output with metadata, faces, and OCR
+    informacion_image_completo = {
+        "id": informacion_image_dict["index"],
+        "start": informacion_image_dict["start"],
+        "end": informacion_image_dict["end"],
+        "faces": faces_detected,
+        "ocr": ocr_text_easyocr,
+    }
+
+    return informacion_image_completo
+
 """
 # ==============================================================================
 # API Helpers
@@ -432,79 +654,66 @@ def scenes_extraction(
     """
     return _get_scenes_extraction(video_file, threshold, offset_frames, crop_ratio)
 
-@spaces.GPU
+
 def describe_list_images(
     images: List[Image.Image]
 ) -> List[str]:
     """
-    Generate brief visual descriptions for a list of PIL Images using Salamandra Vision.
+    Endpoint wrapper for generating brief descriptions of a list of images.
+
+    This function acts as a wrapper around the internal `_get_image_list_description` function.
+    It takes a list of PIL Images and returns a list of short textual descriptions for each image.
 
     Args:
-        images (List[Image.Image]): List of PIL Image objects to describe.
+        images (List[Image.Image]): A list of PIL Image objects to describe.
 
     Returns:
-        List[str]: List of descriptions, one per image.
+        List[str]: A list of strings, where each string is a brief description of the corresponding image.
     """
+    return _get_image_list_description(images)
 
-    # Load the Salamandra Vision model
-    path_model = "BSC-LT/salamandra-7b-vision"
-    processor = AutoProcessor.from_pretrained(path_model)
-    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
-        path_model,
-        torch_dtype=torch.float16,
-        low_cpu_mem_usage=True
-    ).to("cuda")
-
-    # System prompt for image description
-    sys_prompt = (
-        "You are an expert in visual storytelling. "
-        "Describe the image very briefly and simply in Catalan, "
-        "explaining only the main action seen. "
-        "Respond with a single short sentence (maximum 10–20 words), "
-        "without adding unnecessary details or describing the background."
-    )
 
-    all_results = []
-
-    for img in images:
-        batch = [img]
-
-        # Create the conversation template
-        conversation = [
-            {"role": "system", "content": sys_prompt},
-            {"role": "user", "content": [
-                {"type": "image", "image": batch[0]},
-                {"type": "text", "text": (
-                    "Describe the image very briefly and simply in Catalan."
-                )}
-            ]}
-        ]
-        prompt_batch = processor.apply_chat_template(conversation, add_generation_prompt=True)
+def add_ocr_characters_to_image(
+    image: Image.Image,
+    informacion_image: Dict[str, Any],
+    face_col: List[Dict[str, Any]]
+) -> Dict[str, Any]:
+    """
+    Endpoint wrapper for processing an image to extract face identities and OCR text.
 
-        # Prepare inputs for the model
-        inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
-        for k, v in inputs.items():
-            if v.dtype.is_floating_point:
-                inputs[k] = v.to("cuda", torch.float16)
-            else:
-                inputs[k] = v.to("cuda")
+    This function serves as a wrapper for the internal `_get_ocr_characters_to_image`
+    function. It receives an image, metadata describing that image, and a collection
+    of stored face embeddings. The wrapped internal function performs the following:
+
+    1. Detects faces and generates embeddings for each detected face.
+    2. Matches these embeddings against a reference database using K-nearest neighbors.
+    3. Runs OCR (Optical Character Recognition) on the image to extract textual content.
+    4. Applies filtering to discard invalid or noisy OCR results.
+    5. Returns a structured dictionary containing image metadata, identified faces,
+       and OCR-extracted text.
 
-        # Generate the description
-        output = model.generate(**inputs, max_new_tokens=1024)
-        text = processor.decode(output[0], skip_special_tokens=True)
-        lines = text.split("\n")
+    Parameters
+    ----------
+    image : PIL.Image.Image
+        The image object to be analyzed.
+    informacion_image : Dict[str, Any]
+        Metadata describing the image (such as index, start timestamp, end timestamp).
+    face_col : List[Dict[str, Any]]
+        A list of dictionaries representing stored face embeddings and related identity
+        information, used for similarity matching.
 
-        # Extract the assistant's answer
-        desc = ""
-        for i, line in enumerate(lines):
-            if line.lower().startswith(" assistant"):
-                desc = "\n".join(lines[i+1:]).strip()
-                break
-        print("====================")
-        print(desc)
-        all_results.append(desc)
+    Returns
+    -------
+    Dict[str, Any]
+        A dictionary containing:
+        - id: the image identifier
+        - start: start timestamp
+        - end: end timestamp
+        - faces: detected face identities
+        - ocr: the extracted OCR text
+    """
+    return _get_ocr_characters_to_image(image,informacion_image,face_col)
 
-    return all_results
 
 """
 # ==============================================================================
@@ -558,32 +767,51 @@ def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict
     ]
     return convo
 
-
-with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
-    gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImage + text → description.")
-
+custom_css = """
+h2 {
+    background: #e3e4e6 !important;
+    padding: 14px 22px !important;
+    border-radius: 14px !important;
+    box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
+    display: block !important; /* ocupa tot l'ample */
+    width: 100% !important; /* assegura 100% */
+    margin: 20px auto !important;
+    text-align:center;
+}
+"""
+with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU", css=custom_css) as demo:
+    # Main title H1 centered
+    gr.Markdown('<h1 style="text-align:center">SALAMANDRA VISION 7B · ZEROGPU</h1>')
+    gr.Markdown("---")
+
+    # ---------------------
+    # Section: Single image inference
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Inferència per imatge única</h2>')
    with gr.Row():
        with gr.Column():
-            in_img = gr.Image(label="Image", type="pil")
-            in_txt = gr.Textbox(label="Text/prompt", value="Describe the image in detail (ES/CA).")
-            max_new = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
-            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
-            btn = gr.Button("Generate", variant="primary")
+            in_img = gr.Image(label="Imatge", type="pil")
+            in_txt = gr.Textbox(label="Text/prompt", value="Descriu la imatge amb detall (ES/CA).")
+            max_new = gr.Slider(16, 1024, value=256, step=16, label="màx_tokens nous")
+            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperatura")
+            btn = gr.Button("Genera", variant="primary")
        with gr.Column():
-            out = gr.Textbox(label="Description", lines=18)
+            out = gr.Textbox(label="Descripció", lines=18)
 
-    # Single image inference
    btn.click(_infer_one, [in_img, in_txt, max_new, temp], out, api_name="describe", concurrency_limit=1)
+    gr.Markdown("---")
 
-    # Batch API for engine (Gradio Client): images + context_json → list[str]
-    batch_in_images = gr.Gallery(label="Batch images", show_label=False, columns=4, height="auto")
+    # ---------------------
+    # Section: Batch images
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Llot d’imatges</h2>')
+    batch_in_images = gr.Gallery(label="Llot d’imatges", show_label=False, columns=4, height="auto")
    batch_context = gr.Textbox(label="context_json", value="{}", lines=4)
-    batch_max = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
-    batch_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
-    batch_btn = gr.Button("Describe batch")
-    batch_out = gr.JSON(label="Descriptions (list)")
+    batch_max = gr.Slider(16, 1024, value=256, step=16, label="màx_tokens nous")
+    batch_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperatura")
+    batch_btn = gr.Button("Descriu el lot")
+    batch_out = gr.JSON(label="Descripcions (llista)")
 
-    # Note: Gradio Gallery returns paths/objects; the client is used to load files
    batch_btn.click(
        describe_batch,
        [batch_in_images, batch_context, batch_max, batch_temp],
@@ -591,25 +819,32 @@ with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
        api_name="predict",
        concurrency_limit=1
    )
+    gr.Markdown("---")
 
-    # Facial embedding section
+    # ---------------------
+    # Section: Facial embeddings
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Embeddings facials</h2>')
    with gr.Row():
-        face_img = gr.Image(label="Image for facial embedding", type="pil")
-        face_btn = gr.Button("Get facial embedding")
-        face_out = gr.JSON(label="Facial embedding (vector)")
+        face_img = gr.Image(label="Imatge per embedding facial", type="pil")
+        face_btn = gr.Button("Obté embedding facial")
+        face_out = gr.JSON(label="Embedding facial (vector)")
    face_btn.click(face_image_embedding, [face_img], face_out, api_name="face_image_embedding", concurrency_limit=1)
+    gr.Markdown("---")
 
-    # Video scene extraction section
+    # ---------------------
+    # Section: Video scene extraction
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Extracció d’escenes de vídeo</h2>')
    with gr.Row():
-        video_file = gr.Video(label="Upload a video")
-        threshold = gr.Slider(0.0, 100.0, value=30.0, step=1.0, label="Threshold")
-        offset_frames = gr.Slider(0, 30, value=5, step=1, label="Offset frames")
-        crop_ratio = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Crop ratio")
-        scenes_btn = gr.Button("Extract scenes")
-        scenes_gallery_out = gr.Gallery(label="Scene keyframes", show_label=False, columns=4, height="auto")
-        scenes_info_out = gr.JSON(label="Scene information")
-
-    # Bind the scene extraction function
+        video_file = gr.Video(label="Puja un vídeo")
+        threshold = gr.Slider(0.0, 100.0, value=30.0, step=1.0, label="Llindar")
+        offset_frames = gr.Slider(0, 30, value=5, step=1, label="Desplaçament de frames")
+        crop_ratio = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Raó de retall")
+        scenes_btn = gr.Button("Extreu escenes")
+        scenes_gallery_out = gr.Gallery(label="Fotogrames clau de l’escena", show_label=False, columns=4, height="auto")
+        scenes_info_out = gr.JSON(label="Informació de l’escena")
+
    scenes_btn.click(
        scenes_extraction,
        inputs=[video_file, threshold, offset_frames, crop_ratio],
@@ -617,21 +852,52 @@ with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
        api_name="scenes_extraction",
        concurrency_limit=1
    )
+    gr.Markdown("---")
 
-    # List image description with Salamandra Vision
+    # ---------------------
+    # Section: Batch description with Salamandra Vision
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Descripció per lots amb Salamandra Vision</h2>')
    with gr.Row():
-        img_input = gr.Gallery(label="List images", show_label=False, columns=4, height="auto")
-        describe_btn = gr.Button("Generate descriptions")
-        desc_output = gr.Textbox(label="Image descriptions", lines=10)
+        img_input = gr.Gallery(label="Llot d’imatges", show_label=False)
+        describe_btn = gr.Button("Genera descripcions")
+        desc_output = gr.Textbox(label="Descripcions de les imatges")
 
    describe_btn.click(
-        fn=lambda imgs: describe_list_images([img for img in imgs if isinstance(img, Image.Image)])
-        if imgs else ["No images uploaded."],
+        describe_list_images,
        inputs=[img_input],
        outputs=desc_output,
        api_name="describe_images",
        concurrency_limit=1
    )
+    gr.Markdown("---")
+
+    # ---------------------
+    # Section: Add OCR and characters to image
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Afegiu OCR i informació de caràcters al vídeo</h2>')
+    with gr.Row():
+        img_input = gr.Image(label="Imatge per ampliar la descripció", type="pil")
+        info_input = gr.Textbox(
+            label="Diccionari informacion_image (format JSON)",
+            placeholder='{"index": 0, "start": 0.0, "end": 1.2}',
+            lines=3
+        )
+        faces_input = gr.Textbox(
+            label="Llistat de diccionaris face_col (format JSON)",
+            placeholder='[{"nombre": "Anna", "embedding": [0.12, 0.88, ...]}, ...]',
+            lines=5
+        )
+        process_btn = gr.Button("Processar imatge (OCR + Persones)")
+        output_json = gr.JSON(label="Resultat complet")
+
+    process_btn.click(
+        add_ocr_characters_to_image,
+        inputs=[img_input, info_input, faces_input],
+        outputs=output_json,
+        api_name="add_ocr_and_faces",
+        concurrency_limit=1
+    )
 
    demo.queue(max_size=16).launch(show_error=True)
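
For engine integrators, the new endpoints in this commit (api_name="face_image_embedding" and api_name="add_ocr_and_faces") follow the JSON-string contract visible in the diff. The sketch below is illustrative only and is not part of the commit: the Space id, file names, and the "Anna" identity are placeholders, it assumes the gradio_client package is installed, and it assumes face_image_embedding now returns one embedding per detected face, as the updated _get_face_embedding does.

    # Hypothetical client-side sketch; Space id, file names and identity are placeholders.
    import json
    from gradio_client import Client, handle_file

    client = Client("owner/space-id")  # placeholder Space id

    # Build a one-entry face database from a reference photo.
    # The updated endpoint returns a list of embeddings (one per detected face).
    embeddings = client.predict(handle_file("reference_face.png"), api_name="/face_image_embedding")
    face_col = [{"nombre": "Anna", "embedding": embeddings[0]}] if embeddings else []

    # Identify faces and run OCR on a single scene keyframe.
    # informacion_image and face_col travel as JSON strings, matching the Textbox inputs.
    informacion_image = {"index": 0, "start": 0.0, "end": 1.2}
    result = client.predict(
        handle_file("keyframe.png"),
        json.dumps(informacion_image),
        json.dumps(face_col),
        api_name="/add_ocr_and_faces",
    )
    print(result)  # e.g. {"id": 0, "start": 0.0, "end": 1.2, "faces": [...], "ocr": "..."}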