VeuReu committed
Commit 970f203 · verified · 1 Parent(s): e877380

Update app.py

Files changed (1): app.py +86 -0
app.py CHANGED

@@ -432,6 +432,78 @@ def scenes_extraction(
     """
     return _get_scenes_extraction(video_file, threshold, offset_frames, crop_ratio)
 
+@spaces.GPU
+def describe_list_images(
+    images: List[Image.Image]
+) -> List[str]:
+    """
+    Generate brief visual descriptions for a list of PIL Images using Salamandra Vision.
+
+    Args:
+        images (List[Image.Image]): List of PIL Image objects to describe.
+
+    Returns:
+        List[str]: List of descriptions, one per image.
+    """
+
+    # Load the Salamandra Vision model
+    path_model = "BSC-LT/salamandra-7b-vision"
+    processor = AutoProcessor.from_pretrained(path_model)
+    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
+        path_model,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True
+    ).to("cuda")
+
+    # System prompt for image description
+    sys_prompt = (
+        "You are an expert in visual storytelling. "
+        "Describe the image very briefly and simply in Catalan, "
+        "explaining only the main action seen. "
+        "Respond with a single short sentence (maximum 10–20 words), "
+        "without adding unnecessary details or describing the background."
+    )
+
+    all_results = []
+
+    for img in images:
+        batch = [img]
+
+        # Create the conversation template
+        conversation = [
+            {"role": "system", "content": sys_prompt},
+            {"role": "user", "content": [
+                {"type": "image", "image": batch[0]},
+                {"type": "text", "text": (
+                    "Describe the image very briefly and simply in Catalan."
+                )}
+            ]}
+        ]
+        prompt_batch = processor.apply_chat_template(conversation, add_generation_prompt=True)
+
+        # Prepare inputs for the model
+        inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
+        for k, v in inputs.items():
+            if v.dtype.is_floating_point:
+                inputs[k] = v.to("cuda", torch.float16)
+            else:
+                inputs[k] = v.to("cuda")
+
+        # Generate the description
+        output = model.generate(**inputs, max_new_tokens=1024)
+        text = processor.decode(output[0], skip_special_tokens=True)
+        lines = text.split("\n")
+
+        # Extract the assistant's answer
+        desc = ""
+        for i, line in enumerate(lines):
+            if line.lower().startswith(" assistant"):
+                desc = "\n".join(lines[i+1:]).strip()
+                break
+
+        all_results.append(desc)
+
+    return all_results
 
 """
 # ==============================================================================
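For reference, a minimal sketch of how the new helper might be called directly from Python inside the Space. The frame paths below are placeholders, and PIL's Image is assumed to already be imported in app.py, as the type hints suggest:

from PIL import Image

# Hypothetical frame paths; describe_list_images expects a list of PIL Image objects
paths = ["frame_001.jpg", "frame_002.jpg"]
frames = [Image.open(p).convert("RGB") for p in paths]

descriptions = describe_list_images(frames)
for p, d in zip(paths, descriptions):
    print(f"{p}: {d}")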
 
@@ -545,5 +617,19 @@ with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
         concurrency_limit=1
     )
 
+    # List image description with Salamandra Vision
+    with gr.Row():
+        img_input = gr.Gallery(label="Batch images", show_label=False)
+        describe_btn = gr.Button("Generate descriptions")
+    desc_output = gr.Textbox(label="Image descriptions", lines=5)
+
+    describe_btn.click(
+        describe_list_images,
+        inputs=[img_input],
+        outputs=desc_output,
+        api_name="describe_images",
+        concurrency_limit=1
+    )
+
 demo.queue(max_size=16).launch(show_error=True)
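Because the click handler registers api_name="describe_images", the endpoint should also be reachable programmatically once the Space is running. A rough sketch using gradio_client; the Space id below is a placeholder, and the exact payload format a gr.Gallery input accepts can vary between Gradio versions:

from gradio_client import Client, handle_file

client = Client("VeuReu/your-space-name")  # placeholder Space id, not the real one
result = client.predict(
    [handle_file("frame_001.jpg"), handle_file("frame_002.jpg")],  # gallery payload; format may vary
    api_name="/describe_images",
)
print(result)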