Update app.py
Browse files
app.py
CHANGED
|
@@ -432,6 +432,78 @@ def scenes_extraction(
|
|
| 432 |
"""
|
| 433 |
return _get_scenes_extraction(video_file, threshold, offset_frames, crop_ratio)
|
| 434 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
|
| 436 |
"""
|
| 437 |
# ==============================================================================
|
|
@@ -545,5 +617,19 @@ with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
|
|
| 545 |
concurrency_limit=1
|
| 546 |
)
|
| 547 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
demo.queue(max_size=16).launch(show_error=True)
|
| 549 |
|
|
|
|
| 432 |
"""
|
| 433 |
return _get_scenes_extraction(video_file, threshold, offset_frames, crop_ratio)
|
| 434 |
|
| 435 |
+
# Checkpoint used for image description. The processor/model are cached at
# module level so repeated calls do not reload the 7B weights from disk.
_VISION_MODEL_PATH = "BSC-LT/salamandra-7b-vision"
_vision_processor = None
_vision_model = None


def _load_vision_model():
    """Lazily load and cache the Salamandra Vision processor and fp16 model on CUDA.

    Returns:
        tuple: ``(processor, model)`` ready for inference.
    """
    # NOTE(review): caching across @spaces.GPU invocations assumes the runtime
    # keeps the CUDA allocation usable between requests — confirm on ZeroGPU.
    global _vision_processor, _vision_model
    if _vision_model is None:
        _vision_processor = AutoProcessor.from_pretrained(_VISION_MODEL_PATH)
        _vision_model = LlavaOnevisionForConditionalGeneration.from_pretrained(
            _VISION_MODEL_PATH,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        ).to("cuda")
    return _vision_processor, _vision_model


def _coerce_to_pil(item) -> Image.Image:
    """Return a PIL image for *item*.

    ``gr.Gallery`` hands the handler ``(image, caption)`` pairs rather than
    bare images, so unwrap one level of tuple/list. PIL images pass through
    unchanged; anything else (presumably a file path — confirm against the
    Gradio version in use) is handed to ``Image.open``.
    """
    if isinstance(item, (tuple, list)) and item:
        item = item[0]
    if isinstance(item, Image.Image):
        return item
    return Image.open(item)


@spaces.GPU
def describe_list_images(
    images: List[Image.Image]
) -> List[str]:
    """
    Generate brief visual descriptions for a list of PIL Images using Salamandra Vision.

    Args:
        images (List[Image.Image]): List of PIL Image objects to describe.
            Items may also be ``(image, caption)`` pairs as produced by
            ``gr.Gallery``.

    Returns:
        List[str]: List of descriptions, one per image. An entry is "" when
        the assistant's answer cannot be located in the decoded output.
    """
    if not images:
        return []

    processor, model = _load_vision_model()

    # System prompt constraining every answer to one short Catalan sentence.
    sys_prompt = (
        "You are an expert in visual storytelling. "
        "Describe the image very briefly and simply in Catalan, "
        "explaining only the main action seen. "
        "Respond with a single short sentence (maximum 10–20 words), "
        "without adding unnecessary details or describing the background."
    )

    all_results = []

    for item in images:
        img = _coerce_to_pil(item)

        # Create the conversation template for this single image.
        conversation = [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": (
                    "Describe the image very briefly and simply in Catalan."
                )}
            ]}
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

        # Prepare inputs: every tensor goes to CUDA; floating tensors are also
        # cast to fp16 to match the model weights.
        inputs = processor(images=[img], text=prompt, return_tensors="pt")
        for k, v in inputs.items():
            if v.dtype.is_floating_point:
                inputs[k] = v.to("cuda", torch.float16)
            else:
                inputs[k] = v.to("cuda")

        # inference_mode avoids building autograd state during generation.
        with torch.inference_mode():
            output = model.generate(**inputs, max_new_tokens=1024)
        text = processor.decode(output[0], skip_special_tokens=True)
        lines = text.split("\n")

        # The decoded text is the full chat transcript; the answer is
        # everything after the line starting with " assistant" (the leading
        # space matches the chat template's rendering of the role header).
        desc = ""
        for i, line in enumerate(lines):
            if line.lower().startswith(" assistant"):
                desc = "\n".join(lines[i+1:]).strip()
                break

        all_results.append(desc)

    return all_results
|
| 507 |
|
| 508 |
"""
|
| 509 |
# ==============================================================================
|
|
|
|
| 617 |
concurrency_limit=1
|
| 618 |
)
|
| 619 |
|
| 620 |
+
# List image description with Salamandra Vision
|
| 621 |
+
with gr.Row():
|
| 622 |
+
img_input = gr.Gallery(label="Batch images", show_label=False)
|
| 623 |
+
describe_btn = gr.Button("Generate descriptions")
|
| 624 |
+
desc_output = gr.Textbox(label="Image descriptions", lines=5)
|
| 625 |
+
|
| 626 |
+
describe_btn.click(
|
| 627 |
+
describe_list_images,
|
| 628 |
+
inputs=[img_input],
|
| 629 |
+
outputs=desc_output,
|
| 630 |
+
api_name="describe_images",
|
| 631 |
+
concurrency_limit=1
|
| 632 |
+
)
|
| 633 |
+
|
| 634 |
demo.queue(max_size=16).launch(show_error=True)
|
| 635 |
|