Spaces:

VeuReu
/

svision

Running on Zero

App Files Files Community

VeuReu committed 29 days ago

Commit

bc0102e

verified ·

1 Parent(s): e2dc4cb

Update app.py

Browse files

Files changed (1) hide show

app.py +150 -136

app.py CHANGED Viewed

@@ -1,136 +1,150 @@
-# app.py — veureu/svision (Salamandra Vision 7B · ZeroGPU) — compatible con ENGINE
-import os
-import json
-from typing import Dict, List, Optional, Tuple, Union
-import gradio as gr
-import spaces
-import torch
-from PIL import Image
-from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
-MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")
-DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-_model = None
-_processor = None
-def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
-    global _model, _processor
-    if _model is None or _processor is None:
-        _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-        _model = LlavaOnevisionForConditionalGeneration.from_pretrained(
-            MODEL_ID,
-            dtype=DTYPE,
-            low_cpu_mem_usage=True,
-            trust_remote_code=True,
-            use_safetensors=True,
-            device_map=None,
-        )
-        _model.to(DEVICE)
-    return _model, _processor
-def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]:
-    """Construye el chat template con imagen + texto + contexto opcional."""
-    ctx_txt = ""
-    if context:
-        try:
-            # breve, sin ruido
-            ctx_txt = "\n\nContexto adicional:\n" + json.dumps(context, ensure_ascii=False)[:2000]
-        except Exception:
-            pass
-    user_txt = (user_text or "Describe la imagen con detalle.") + ctx_txt
-    convo = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image"},
-                {"type": "text", "text": user_txt},
-            ],
-        }
-    ]
-    return convo
-@spaces.GPU  # en HF Spaces usará GPU cuando haya disponibilidad (ZeroGPU)
-def _infer_one(image: Image.Image, text: str, max_new_tokens: int = 256, temperature: float = 0.7,
-               context: Optional[Dict] = None) -> str:
-    # Reducir el tamaño de la imagen para ahorrar memoria en la GPU
-    image.thumbnail((1024, 1024))
-    model, processor = _lazy_load()
-    prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True)
-    inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE)
-    with torch.inference_mode():
-        out = model.generate(**inputs, max_new_tokens=int(max_new_tokens), temperature=float(temperature))
-    return processor.decode(out[0], skip_special_tokens=True).strip()
-# ----------------------------- API helpers -----------------------------------
-def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.",
-                 max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
-    result = _infer_one(image, text, max_new_tokens, temperature, context=None)
-    return {"text": result}
-def describe_batch(images: List[Image.Image], context_json: str,
-                   max_new_tokens: int = 256, temperature: float = 0.7) -> List[str]:
-    """Endpoint batch para ENGINE: lista de imágenes + contexto (JSON) → lista de textos."""
-    try:
-        context = json.loads(context_json) if context_json else None
-    except Exception:
-        context = None
-    outputs: List[str] = []
-    for img in images:
-        outputs.append(_infer_one(img, text="Describe la imagen con detalle.", max_new_tokens=max_new_tokens,
-                                  temperature=temperature, context=context))
-    return outputs
-# ----------------------------- UI & Endpoints --------------------------------
-with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
-    gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
-    with gr.Row():
-        with gr.Column():
-            in_img = gr.Image(label="Imagen", type="pil")
-            in_txt = gr.Textbox(label="Texto/prompt", value="Describe la imagen con detalle (ES/CA).")
-            max_new = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
-            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
-            btn = gr.Button("Generar", variant="primary")
-        with gr.Column():
-            out = gr.Textbox(label="Descripción", lines=18)
-    # UI
-    btn.click(_infer_one, [in_img, in_txt, max_new, temp], out, api_name="describe", concurrency_limit=1)
-    # API simple (multipart) compatible con tu versión anterior
-    # demo.load(
-    #     None,
-    #     [gr.Image(label="image", type="pil"),
-    #      gr.Textbox(value="Describe la imagen con detalle."),
-    #      gr.Slider(16, 1024, value=256),
-    #      gr.Slider(0.0, 1.5, value=0.7)],
-    #     describe_raw,
-    #     api_name="describe_raw"
-    # )
-    # API BATCH para ENGINE (Gradio Client): images + context_json → list[str]
-    # Firma que espera el VisionClient del engine (api_name="/predict")
-    batch_in_images = gr.Gallery(label="Imágenes (batch)", show_label=False, columns=4, height="auto")
-    batch_context = gr.Textbox(label="context_json", value="{}", lines=4)
-    batch_max = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
-    batch_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
-    batch_btn = gr.Button("Describir lote")
-    batch_out = gr.JSON(label="Descripciones (lista)")
-    # Nota: Gradio Gallery entrega rutas/obj; nos apoyamos en el cliente para cargar archivos
-    batch_btn.click(describe_batch, [batch_in_images, batch_context, batch_max, batch_temp], batch_out,
-                    api_name="predict", concurrency_limit=1)
-demo.queue(max_size=16).launch()

+# app.py — veureu/svision (Salamandra Vision 7B · ZeroGPU) — compatible con ENGINE
+import os
+import json
+from typing import Dict, List, Optional, Tuple, Union
+import gradio as gr
+import spaces
+import torch
+from PIL import Image
+from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
+# Hugging Face model repository to load; overridable through the MODEL_ID environment variable.
+MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")
+# Half precision when CUDA is reported available at import time, full precision otherwise.
+DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
+# Target device, selected with the same CUDA-availability check as DTYPE.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Module-level cache slots for the model and its processor; both start unset
+# and are populated by _lazy_load() on first use.
+_model = None
+_processor = None
+def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
+    """Return the cached ``(model, processor)`` pair, loading both on first use.
+
+    Both objects are loaded from ``MODEL_ID`` and stored in the module-level
+    ``_model`` / ``_processor`` globals. If either one is still unset, both are
+    (re)loaded and the model is moved to ``DEVICE``.
+    """
+    global _model, _processor
+    # Fast path: everything is already in memory.
+    if _model is not None and _processor is not None:
+        return _model, _processor
+    # trust_remote_code=True lets code shipped in the model repository run locally.
+    _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+    load_options = {
+        "dtype": DTYPE,
+        "low_cpu_mem_usage": True,
+        "trust_remote_code": True,
+        "use_safetensors": True,
+        "device_map": None,
+    }
+    _model = LlavaOnevisionForConditionalGeneration.from_pretrained(MODEL_ID, **load_options)
+    _model.to(DEVICE)
+    return _model, _processor
+def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]:
+    """Build the single-turn conversation (image + text) fed to the chat template.
+
+    Args:
+        user_text: Prompt typed by the user. When empty or None, the default
+            prompt "Describe la imagen con detalle." is used instead.
+        context: Optional extra context. When truthy it is serialized to JSON
+            (non-ASCII characters kept as-is), capped at 2000 characters and
+            appended to the prompt under a "Contexto adicional:" heading.
+
+    Returns:
+        A one-element list holding a "user" message whose content is an image
+        placeholder followed by the text part.
+    """
+    ctx_txt = ""
+    if context:
+        try:
+            # Keep the prompt short: at most 2000 characters of serialized context.
+            ctx_txt = "\n\nContexto adicional:\n" + json.dumps(context, ensure_ascii=False)[:2000]
+        except (TypeError, ValueError, RecursionError):
+            # Best effort: a context that cannot be serialized (unsupported
+            # type, circular or overly deep structure) is dropped rather than
+            # failing the request. Unrelated errors are no longer swallowed.
+            ctx_txt = ""
+    user_txt = (user_text or "Describe la imagen con detalle.") + ctx_txt
+    return [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": user_txt},
+            ],
+        }
+    ]
+@spaces.GPU  # on HF Spaces this runs on a GPU when one is available (ZeroGPU)
+def _infer_one(image: Image.Image, text: str, max_new_tokens: int = 256, temperature: float = 0.7,
+               context: Optional[Dict] = None) -> str:
+    """Generate a text description for one image.
+
+    Args:
+        image: Input PIL image. It is not modified; a downscaled copy is used.
+        text: User prompt (a default prompt is substituted when empty).
+        max_new_tokens: Upper bound on generated tokens.
+        temperature: Sampling temperature. Values > 0 enable sampling; 0 (or
+            less) selects deterministic greedy decoding.
+        context: Optional extra context appended to the prompt as JSON.
+
+    Returns:
+        The decoded model output with special tokens removed and whitespace stripped.
+
+    Raises:
+        ValueError: If no image is provided.
+    """
+    if image is None:
+        # Gradio passes None when the image input is empty; fail with a clear message.
+        raise ValueError("Debes proporcionar una imagen.")
+    # thumbnail() resizes in place, so work on a copy to leave the caller's image intact.
+    # Neither side of the copy exceeds 1024 px, which saves GPU memory.
+    image = image.copy()
+    image.thumbnail((1024, 1024))
+    model, processor = _lazy_load()
+    prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True)
+    inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE)
+    temperature = float(temperature)
+    gen_kwargs = {"max_new_tokens": int(max_new_tokens)}
+    if temperature > 0.0:
+        # temperature only takes effect when sampling is switched on.
+        gen_kwargs.update(do_sample=True, temperature=temperature)
+    else:
+        # A temperature of 0 is not valid for sampling; use greedy decoding instead.
+        gen_kwargs["do_sample"] = False
+    with torch.inference_mode():
+        out = model.generate(**inputs, **gen_kwargs)
+    # NOTE(review): out[0] presumably still includes the prompt tokens, so the
+    # returned text likely echoes the prompt before the answer — confirm whether
+    # the ENGINE client relies on that before trimming it here.
+    return processor.decode(out[0], skip_special_tokens=True).strip()
+# ----------------------------- API helpers -----------------------------------
+def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.",
+                 max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
+    """Describe one image without extra context and wrap the answer in a dict.
+
+    Returns:
+        ``{"text": <description>}`` where the description comes from ``_infer_one``.
+    """
+    return {"text": _infer_one(image, text, max_new_tokens, temperature, context=None)}
+def _to_pil(item) -> Image.Image:
+    """Coerce one gallery entry into a PIL image."""
+    # Gallery inputs may arrive as (media, caption) pairs; keep only the media part.
+    if isinstance(item, (tuple, list)) and item:
+        item = item[0]
+    if isinstance(item, Image.Image):
+        return item
+    # Anything else is treated as a file path / file object that PIL can open.
+    return Image.open(item)
+def describe_batch(images: List[Image.Image], context_json: str,
+                   max_new_tokens: int = 256, temperature: float = 0.7) -> List[str]:
+    """Batch endpoint for ENGINE: list of images + context (JSON) -> list of texts.
+
+    Args:
+        images: Gallery entries. Each may be a PIL image, a file path, or a
+            (media, caption) pair; None or an empty list yields an empty result.
+        context_json: JSON string with extra context shared by every image.
+            Empty or unparsable input is treated as "no context".
+        max_new_tokens: Upper bound on generated tokens per image.
+        temperature: Sampling temperature forwarded to ``_infer_one``.
+
+    Returns:
+        One description per input image, in input order.
+    """
+    try:
+        context = json.loads(context_json) if context_json else None
+    except (TypeError, ValueError):
+        # Best effort: invalid JSON (JSONDecodeError is a ValueError) or a
+        # non-string payload simply means the batch runs without context.
+        context = None
+    return [
+        _infer_one(_to_pil(item), text="Describe la imagen con detalle.", max_new_tokens=max_new_tokens,
+                   temperature=temperature, context=context)
+        for item in images or []
+    ]
+def image_size_str(image: Image.Image) -> str:
+    """Return the image dimensions formatted as "<width>x<height>".
+
+    Raises:
+        ValueError: If ``image`` is None.
+    """
+    if image is None:
+        raise ValueError("Debes proporcionar una imagen.")
+    w, h = image.size
+    return str(w) + "x" + str(h)
+# ----------------------------- UI & Endpoints --------------------------------
+# Build the Gradio interface: a single-image form, a batch form and a small
+# image-size utility, each wired to a named API endpoint.
+with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
+    gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
+    with gr.Row():
+        # Left column: inputs for the single-image description.
+        with gr.Column():
+            in_img = gr.Image(label="Imagen", type="pil")
+            in_txt = gr.Textbox(label="Texto/prompt", value="Describe la imagen con detalle (ES/CA).")
+            max_new = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
+            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
+            btn = gr.Button("Generar", variant="primary")
+        # Right column: generated description.
+        with gr.Column():
+            out = gr.Textbox(label="Descripción", lines=18)
+    # UI: single-image inference, exposed as the "describe" endpoint.
+    btn.click(_infer_one, [in_img, in_txt, max_new, temp], out, api_name="describe", concurrency_limit=1)
+    # Simple (multipart) API compatible with the previous version — currently
+    # disabled; kept below as commented-out code.
+    # demo.load(
+    #     None,
+    #     [gr.Image(label="image", type="pil"),
+    #      gr.Textbox(value="Describe la imagen con detalle."),
+    #      gr.Slider(16, 1024, value=256),
+    #      gr.Slider(0.0, 1.5, value=0.7)],
+    #     describe_raw,
+    #     api_name="describe_raw"
+    # )
+    # BATCH API for ENGINE (Gradio Client): images + context_json → list[str]
+    # Signature expected by the engine's VisionClient (api_name="/predict")
+    batch_in_images = gr.Gallery(label="Imágenes (batch)", show_label=False, columns=4, height="auto")
+    batch_context = gr.Textbox(label="context_json", value="{}", lines=4)
+    batch_max = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
+    batch_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
+    batch_btn = gr.Button("Describir lote")
+    batch_out = gr.JSON(label="Descripciones (lista)")
+    # Note: Gradio Gallery delivers paths/objects; we rely on the client to load the files
+    batch_btn.click(describe_batch, [batch_in_images, batch_context, batch_max, batch_temp], batch_out,
+                    api_name="predict", concurrency_limit=1)
+    # Utility endpoint: return the image size as a string
+    size_img = gr.Image(label="Imagen para tamaño", type="pil")
+    size_btn = gr.Button("Obtener tamaño")
+    size_out = gr.Textbox(label="Tamaño (ancho x alto)")
+    size_btn.click(image_size_str, [size_img], size_out, api_name="image_size", concurrency_limit=4)
+# Enable the request queue (at most 16 pending requests) and start the app.
+demo.queue(max_size=16).launch()