Update app.py
Browse files
app.py
CHANGED
|
@@ -432,6 +432,78 @@ def scenes_extraction(
|
|
| 432 |
"""
|
| 433 |
return _get_scenes_extraction(video_file, threshold, offset_frames, crop_ratio)
|
| 434 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
|
| 436 |
"""
|
| 437 |
# ==============================================================================
|
|
@@ -545,5 +617,19 @@ with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
|
|
| 545 |
concurrency_limit=1
|
| 546 |
)
|
| 547 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
demo.queue(max_size=16).launch(show_error=True)
|
| 549 |
|
|
|
|
| 432 |
"""
|
| 433 |
return _get_scenes_extraction(video_file, threshold, offset_frames, crop_ratio)
|
| 434 |
|
| 435 |
+
# Checkpoint used for image description. The processor/model are cached at
# module level so repeated calls do not reload the 7B weights from disk.
_VISION_MODEL_PATH = "BSC-LT/salamandra-7b-vision"
_vision_processor = None
_vision_model = None


def _load_vision_model():
    """Lazily load and cache the Salamandra Vision processor and fp16 model on CUDA.

    Returns:
        tuple: ``(processor, model)`` ready for inference.
    """
    # NOTE(review): caching across @spaces.GPU invocations assumes the runtime
    # keeps the CUDA allocation usable between requests — confirm on ZeroGPU.
    global _vision_processor, _vision_model
    if _vision_model is None:
        _vision_processor = AutoProcessor.from_pretrained(_VISION_MODEL_PATH)
        _vision_model = LlavaOnevisionForConditionalGeneration.from_pretrained(
            _VISION_MODEL_PATH,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        ).to("cuda")
    return _vision_processor, _vision_model


def _coerce_to_pil(item) -> Image.Image:
    """Return a PIL image for *item*.

    ``gr.Gallery`` hands the handler ``(image, caption)`` pairs rather than
    bare images, so unwrap one level of tuple/list. PIL images pass through
    unchanged; anything else (presumably a file path — confirm against the
    Gradio version in use) is handed to ``Image.open``.
    """
    if isinstance(item, (tuple, list)) and item:
        item = item[0]
    if isinstance(item, Image.Image):
        return item
    return Image.open(item)


@spaces.GPU
def describe_list_images(
    images: List[Image.Image]
) -> List[str]:
    """
    Generate brief visual descriptions for a list of PIL Images using Salamandra Vision.

    Args:
        images (List[Image.Image]): List of PIL Image objects to describe.
            Items may also be ``(image, caption)`` pairs as produced by
            ``gr.Gallery``.

    Returns:
        List[str]: List of descriptions, one per image. An entry is "" when
        the assistant's answer cannot be located in the decoded output.
    """
    if not images:
        return []

    processor, model = _load_vision_model()

    # System prompt constraining every answer to one short Catalan sentence.
    sys_prompt = (
        "You are an expert in visual storytelling. "
        "Describe the image very briefly and simply in Catalan, "
        "explaining only the main action seen. "
        "Respond with a single short sentence (maximum 10–20 words), "
        "without adding unnecessary details or describing the background."
    )

    all_results = []

    for item in images:
        img = _coerce_to_pil(item)

        # Create the conversation template for this single image.
        conversation = [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": (
                    "Describe the image very briefly and simply in Catalan."
                )}
            ]}
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

        # Prepare inputs: every tensor goes to CUDA; floating tensors are also
        # cast to fp16 to match the model weights.
        inputs = processor(images=[img], text=prompt, return_tensors="pt")
        for k, v in inputs.items():
            if v.dtype.is_floating_point:
                inputs[k] = v.to("cuda", torch.float16)
            else:
                inputs[k] = v.to("cuda")

        # inference_mode avoids building autograd state during generation.
        with torch.inference_mode():
            output = model.generate(**inputs, max_new_tokens=1024)
        text = processor.decode(output[0], skip_special_tokens=True)
        lines = text.split("\n")

        # The decoded text is the full chat transcript; the answer is
        # everything after the line starting with " assistant" (the leading
        # space matches the chat template's rendering of the role header).
        desc = ""
        for i, line in enumerate(lines):
            if line.lower().startswith(" assistant"):
                desc = "\n".join(lines[i+1:]).strip()
                break

        all_results.append(desc)

    return all_results
|
| 507 |
|
| 508 |
"""
|
| 509 |
# ==============================================================================
|
|
|
|
| 617 |
concurrency_limit=1
|
| 618 |
)
|
| 619 |
|
| 620 |
+
# List image description with Salamandra Vision
|
| 621 |
+
with gr.Row():
|
| 622 |
+
img_input = gr.Gallery(label="Batch images", show_label=False)
|
| 623 |
+
describe_btn = gr.Button("Generate descriptions")
|
| 624 |
+
desc_output = gr.Textbox(label="Image descriptions", lines=5)
|
| 625 |
+
|
| 626 |
+
describe_btn.click(
|
| 627 |
+
describe_list_images,
|
| 628 |
+
inputs=[img_input],
|
| 629 |
+
outputs=desc_output,
|
| 630 |
+
api_name="describe_images",
|
| 631 |
+
concurrency_limit=1
|
| 632 |
+
)
|
| 633 |
+
|
| 634 |
demo.queue(max_size=16).launch(show_error=True)
|
| 635 |
|