VeuReu committed on
Commit
bc0102e
·
verified ·
1 Parent(s): e2dc4cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -136
app.py CHANGED
@@ -1,136 +1,150 @@
1
- # app.py — veureu/svision (Salamandra Vision 7B · ZeroGPU) — compatible con ENGINE
2
- import os
3
- import json
4
- from typing import Dict, List, Optional, Tuple, Union
5
-
6
- import gradio as gr
7
- import spaces
8
- import torch
9
- from PIL import Image
10
- from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
11
-
12
# Hub repository to load; can be overridden through the MODEL_ID environment variable.
MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")

# Half precision on CUDA, full precision on CPU.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.float16
else:
    DEVICE, DTYPE = "cpu", torch.float32

# Process-wide singletons, populated on first use by _lazy_load().
_model = None
_processor = None
18
-
19
-
20
def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
    """Return the shared (model, processor) pair, loading both on first use.

    The pair is cached in the module-level ``_model`` / ``_processor``
    globals. If either one is still ``None`` both are (re)loaded from
    ``MODEL_ID`` and the model is moved to ``DEVICE``.
    """
    global _model, _processor

    # Fast path: everything is already in memory.
    if _model is not None and _processor is not None:
        return _model, _processor

    _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

    load_options = {
        "dtype": DTYPE,
        "low_cpu_mem_usage": True,
        "trust_remote_code": True,
        "use_safetensors": True,
        "device_map": None,
    }
    _model = LlavaOnevisionForConditionalGeneration.from_pretrained(MODEL_ID, **load_options)
    _model.to(DEVICE)
    return _model, _processor
34
-
35
-
36
def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]:
    """Build the single-turn chat conversation (image + text) for the processor.

    Args:
        user_text: Instruction for the model. When empty or ``None`` the
            default "Describe la imagen con detalle." is used instead.
        context: Optional extra data. When truthy it is serialised as JSON
            (non-ASCII characters kept as-is), truncated to 2000 characters
            and appended to the instruction under a "Contexto adicional:"
            heading.

    Returns:
        A one-element list holding a ``"user"`` message whose content is an
        image placeholder followed by the text part.
    """
    import logging  # local import so the block does not depend on file-level imports

    ctx_txt = ""
    if context:
        try:
            serialised = json.dumps(context, ensure_ascii=False)
        except (TypeError, ValueError, RecursionError) as exc:
            # Best effort: a context that cannot be serialised is left out of
            # the prompt, but the drop is logged instead of passing silently.
            logging.getLogger(__name__).warning(
                "Context is not JSON-serialisable; omitting it from the prompt: %s", exc
            )
        else:
            # Keep the prompt short: only the first 2000 characters are used.
            ctx_txt = "\n\nContexto adicional:\n" + serialised[:2000]

    user_txt = (user_text or "Describe la imagen con detalle.") + ctx_txt
    return [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_txt},
            ],
        }
    ]
56
-
57
-
58
@spaces.GPU  # on HF Spaces, runs on a GPU when one is available (ZeroGPU)
def _infer_one(image: Image.Image, text: str, max_new_tokens: int = 256, temperature: float = 0.7,
               context: Optional[Dict] = None) -> str:
    """Generate a description for one image.

    Args:
        image: Input picture. It is not modified; a downsized copy is used.
        text: Instruction for the model (see ``_compose_prompt`` for the
            default used when it is empty).
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values above 0 enable sampling;
            0 (or below) selects deterministic decoding.
        context: Optional extra data appended to the prompt.

    Returns:
        The generated text only (the echoed prompt is stripped), trimmed of
        surrounding whitespace.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    if image is None:
        raise ValueError("Debes proporcionar una imagen.")

    # Shrink the image to save GPU memory. thumbnail() resizes in place, so
    # work on a copy to leave the caller's image untouched.
    image = image.copy()
    image.thumbnail((1024, 1024))

    model, processor = _lazy_load()
    prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True)
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE)

    # temperature only takes effect when sampling is enabled, and a
    # temperature of 0 is not valid for sampling, so fall back to
    # deterministic decoding in that case.
    gen_kwargs = {"max_new_tokens": int(max_new_tokens)}
    temperature = float(temperature)
    if temperature > 0.0:
        gen_kwargs.update(do_sample=True, temperature=temperature)
    else:
        gen_kwargs["do_sample"] = False

    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)

    # generate() returns prompt tokens followed by the continuation; decode
    # only the newly generated part so the prompt is not echoed back.
    prompt_len = inputs["input_ids"].shape[-1]
    return processor.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
70
-
71
-
72
- # ----------------------------- API helpers -----------------------------------
73
-
74
def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.",
                 max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
    """Describe a single image (no extra context) and wrap the answer.

    Returns:
        A dict with a single ``"text"`` key holding the generated description.
    """
    description = _infer_one(
        image,
        text,
        max_new_tokens,
        temperature,
        context=None,
    )
    return dict(text=description)
78
-
79
-
80
def _as_pil_image(item) -> Image.Image:
    """Coerce one batch item into a PIL image."""
    # Gallery-style items come as (media, caption) pairs; keep only the media.
    if isinstance(item, (tuple, list)):
        item = item[0]
    # File paths are opened here; copy() loads the pixels so the file handle
    # can be closed right away.
    if isinstance(item, (str, os.PathLike)):
        with Image.open(item) as opened:
            return opened.copy()
    return item


def describe_batch(images: List[Image.Image], context_json: str,
                   max_new_tokens: int = 256, temperature: float = 0.7) -> List[str]:
    """Batch endpoint for ENGINE: list of images + context (JSON) -> list of texts.

    Args:
        images: Images to describe. Each item may be a PIL image, a file
            path, or a ``(media, caption)`` pair wrapping either of those.
            ``None`` or an empty list yields an empty result.
        context_json: JSON document with shared context for every image.
            Empty or malformed JSON is treated as "no context".
        max_new_tokens: Upper bound on generated tokens per image.
        temperature: Sampling temperature forwarded to ``_infer_one``.

    Returns:
        One description per input image, in input order.
    """
    try:
        context = json.loads(context_json) if context_json else None
    except (TypeError, ValueError):
        # Deliberate best effort: a bad context must not abort the batch.
        # json.JSONDecodeError is a ValueError subclass.
        context = None

    return [
        _infer_one(_as_pil_image(item), text="Describe la imagen con detalle.",
                   max_new_tokens=max_new_tokens, temperature=temperature, context=context)
        for item in (images or [])
    ]
92
-
93
-
94
- # ----------------------------- UI & Endpoints --------------------------------
95
-
96
# Gradio app: one interactive single-image form plus a batch endpoint for the engine.
with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
    gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
    with gr.Row():
        # Left column: inputs and generation controls.
        with gr.Column():
            in_img = gr.Image(label="Imagen", type="pil")
            in_txt = gr.Textbox(label="Texto/prompt", value="Describe la imagen con detalle (ES/CA).")
            max_new = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
            btn = gr.Button("Generar", variant="primary")
        # Right column: generated description.
        with gr.Column():
            out = gr.Textbox(label="Descripción", lines=18)

    # UI: single-image description, also exposed as the "describe" API endpoint.
    btn.click(_infer_one, [in_img, in_txt, max_new, temp], out, api_name="describe", concurrency_limit=1)

    # Simple (multipart) API compatible with the previous version — currently disabled.
    # demo.load(
    #     None,
    #     [gr.Image(label="image", type="pil"),
    #      gr.Textbox(value="Describe la imagen con detalle."),
    #      gr.Slider(16, 1024, value=256),
    #      gr.Slider(0.0, 1.5, value=0.7)],
    #     describe_raw,
    #     api_name="describe_raw"
    # )

    # BATCH API for ENGINE (Gradio Client): images + context_json -> list[str]
    # Signature expected by the engine's VisionClient (api_name="/predict")
    batch_in_images = gr.Gallery(label="Imágenes (batch)", show_label=False, columns=4, height="auto")
    batch_context = gr.Textbox(label="context_json", value="{}", lines=4)
    batch_max = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
    batch_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
    batch_btn = gr.Button("Describir lote")
    batch_out = gr.JSON(label="Descripciones (lista)")

    # Note: Gradio Gallery delivers paths/objects; we rely on the client to load the files.
    batch_btn.click(describe_batch, [batch_in_images, batch_context, batch_max, batch_temp], batch_out,
                    api_name="predict", concurrency_limit=1)

# Enable the request queue (at most 16 pending requests) and start the server.
demo.queue(max_size=16).launch()
136
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py — veureu/svision (Salamandra Vision 7B · ZeroGPU) — compatible con ENGINE
2
+ import os
3
+ import json
4
+ from typing import Dict, List, Optional, Tuple, Union
5
+
6
+ import gradio as gr
7
+ import spaces
8
+ import torch
9
+ from PIL import Image
10
+ from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
11
+
12
# Hub repository to load; can be overridden through the MODEL_ID environment variable.
MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-vision")

# Half precision on CUDA, full precision on CPU.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.float16
else:
    DEVICE, DTYPE = "cpu", torch.float32

# Process-wide singletons, populated on first use by _lazy_load().
_model = None
_processor = None
18
+
19
+
20
def _lazy_load() -> Tuple[LlavaOnevisionForConditionalGeneration, AutoProcessor]:
    """Return the shared (model, processor) pair, loading both on first use.

    The pair is cached in the module-level ``_model`` / ``_processor``
    globals. If either one is still ``None`` both are (re)loaded from
    ``MODEL_ID`` and the model is moved to ``DEVICE``.
    """
    global _model, _processor

    # Fast path: everything is already in memory.
    if _model is not None and _processor is not None:
        return _model, _processor

    _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

    load_options = {
        "dtype": DTYPE,
        "low_cpu_mem_usage": True,
        "trust_remote_code": True,
        "use_safetensors": True,
        "device_map": None,
    }
    _model = LlavaOnevisionForConditionalGeneration.from_pretrained(MODEL_ID, **load_options)
    _model.to(DEVICE)
    return _model, _processor
34
+
35
+
36
def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict]:
    """Build the single-turn chat conversation (image + text) for the processor.

    Args:
        user_text: Instruction for the model. When empty or ``None`` the
            default "Describe la imagen con detalle." is used instead.
        context: Optional extra data. When truthy it is serialised as JSON
            (non-ASCII characters kept as-is), truncated to 2000 characters
            and appended to the instruction under a "Contexto adicional:"
            heading.

    Returns:
        A one-element list holding a ``"user"`` message whose content is an
        image placeholder followed by the text part.
    """
    import logging  # local import so the block does not depend on file-level imports

    ctx_txt = ""
    if context:
        try:
            serialised = json.dumps(context, ensure_ascii=False)
        except (TypeError, ValueError, RecursionError) as exc:
            # Best effort: a context that cannot be serialised is left out of
            # the prompt, but the drop is logged instead of passing silently.
            logging.getLogger(__name__).warning(
                "Context is not JSON-serialisable; omitting it from the prompt: %s", exc
            )
        else:
            # Keep the prompt short: only the first 2000 characters are used.
            ctx_txt = "\n\nContexto adicional:\n" + serialised[:2000]

    user_txt = (user_text or "Describe la imagen con detalle.") + ctx_txt
    return [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_txt},
            ],
        }
    ]
56
+
57
+
58
@spaces.GPU  # on HF Spaces, runs on a GPU when one is available (ZeroGPU)
def _infer_one(image: Image.Image, text: str, max_new_tokens: int = 256, temperature: float = 0.7,
               context: Optional[Dict] = None) -> str:
    """Generate a description for one image.

    Args:
        image: Input picture. It is not modified; a downsized copy is used.
        text: Instruction for the model (see ``_compose_prompt`` for the
            default used when it is empty).
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values above 0 enable sampling;
            0 (or below) selects deterministic decoding.
        context: Optional extra data appended to the prompt.

    Returns:
        The generated text only (the echoed prompt is stripped), trimmed of
        surrounding whitespace.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    if image is None:
        raise ValueError("Debes proporcionar una imagen.")

    # Shrink the image to save GPU memory. thumbnail() resizes in place, so
    # work on a copy to leave the caller's image untouched.
    image = image.copy()
    image.thumbnail((1024, 1024))

    model, processor = _lazy_load()
    prompt = processor.apply_chat_template(_compose_prompt(text, context), add_generation_prompt=True)
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, dtype=DTYPE)

    # temperature only takes effect when sampling is enabled, and a
    # temperature of 0 is not valid for sampling, so fall back to
    # deterministic decoding in that case.
    gen_kwargs = {"max_new_tokens": int(max_new_tokens)}
    temperature = float(temperature)
    if temperature > 0.0:
        gen_kwargs.update(do_sample=True, temperature=temperature)
    else:
        gen_kwargs["do_sample"] = False

    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)

    # generate() returns prompt tokens followed by the continuation; decode
    # only the newly generated part so the prompt is not echoed back.
    prompt_len = inputs["input_ids"].shape[-1]
    return processor.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
70
+
71
+
72
+ # ----------------------------- API helpers -----------------------------------
73
+
74
def describe_raw(image: Image.Image, text: str = "Describe la imagen con detalle.",
                 max_new_tokens: int = 256, temperature: float = 0.7) -> Dict[str, str]:
    """Describe a single image (no extra context) and wrap the answer.

    Returns:
        A dict with a single ``"text"`` key holding the generated description.
    """
    description = _infer_one(
        image,
        text,
        max_new_tokens,
        temperature,
        context=None,
    )
    return dict(text=description)
78
+
79
+
80
def _as_pil_image(item) -> Image.Image:
    """Coerce one batch item into a PIL image."""
    # Gallery-style items come as (media, caption) pairs; keep only the media.
    if isinstance(item, (tuple, list)):
        item = item[0]
    # File paths are opened here; copy() loads the pixels so the file handle
    # can be closed right away.
    if isinstance(item, (str, os.PathLike)):
        with Image.open(item) as opened:
            return opened.copy()
    return item


def describe_batch(images: List[Image.Image], context_json: str,
                   max_new_tokens: int = 256, temperature: float = 0.7) -> List[str]:
    """Batch endpoint for ENGINE: list of images + context (JSON) -> list of texts.

    Args:
        images: Images to describe. Each item may be a PIL image, a file
            path, or a ``(media, caption)`` pair wrapping either of those.
            ``None`` or an empty list yields an empty result.
        context_json: JSON document with shared context for every image.
            Empty or malformed JSON is treated as "no context".
        max_new_tokens: Upper bound on generated tokens per image.
        temperature: Sampling temperature forwarded to ``_infer_one``.

    Returns:
        One description per input image, in input order.
    """
    try:
        context = json.loads(context_json) if context_json else None
    except (TypeError, ValueError):
        # Deliberate best effort: a bad context must not abort the batch.
        # json.JSONDecodeError is a ValueError subclass.
        context = None

    return [
        _infer_one(_as_pil_image(item), text="Describe la imagen con detalle.",
                   max_new_tokens=max_new_tokens, temperature=temperature, context=context)
        for item in (images or [])
    ]
92
+
93
+
94
def image_size_str(image: Image.Image) -> str:
    """Return the image dimensions formatted as '<width>x<height>'.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    if image is not None:
        w, h = image.size
        return "x".join((str(w), str(h)))
    raise ValueError("Debes proporcionar una imagen.")
100
+
101
+
102
+ # ----------------------------- UI & Endpoints --------------------------------
103
+
104
# Gradio app: an interactive single-image form, a batch endpoint for the
# engine, and a small image-size utility endpoint.
with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
    gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImagen + texto → descripción.")
    with gr.Row():
        # Left column: inputs and generation controls.
        with gr.Column():
            in_img = gr.Image(label="Imagen", type="pil")
            in_txt = gr.Textbox(label="Texto/prompt", value="Describe la imagen con detalle (ES/CA).")
            max_new = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
            btn = gr.Button("Generar", variant="primary")
        # Right column: generated description.
        with gr.Column():
            out = gr.Textbox(label="Descripción", lines=18)

    # UI: single-image description, also exposed as the "describe" API endpoint.
    btn.click(_infer_one, [in_img, in_txt, max_new, temp], out, api_name="describe", concurrency_limit=1)

    # Simple (multipart) API compatible with the previous version — currently disabled.
    # demo.load(
    #     None,
    #     [gr.Image(label="image", type="pil"),
    #      gr.Textbox(value="Describe la imagen con detalle."),
    #      gr.Slider(16, 1024, value=256),
    #      gr.Slider(0.0, 1.5, value=0.7)],
    #     describe_raw,
    #     api_name="describe_raw"
    # )

    # BATCH API for ENGINE (Gradio Client): images + context_json -> list[str]
    # Signature expected by the engine's VisionClient (api_name="/predict")
    batch_in_images = gr.Gallery(label="Imágenes (batch)", show_label=False, columns=4, height="auto")
    batch_context = gr.Textbox(label="context_json", value="{}", lines=4)
    batch_max = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
    batch_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
    batch_btn = gr.Button("Describir lote")
    batch_out = gr.JSON(label="Descripciones (lista)")

    # Note: Gradio Gallery delivers paths/objects; we rely on the client to load the files.
    batch_btn.click(describe_batch, [batch_in_images, batch_context, batch_max, batch_temp], batch_out,
                    api_name="predict", concurrency_limit=1)

    # Utility endpoint: return the image size as a string.
    size_img = gr.Image(label="Imagen para tamaño", type="pil")
    size_btn = gr.Button("Obtener tamaño")
    size_out = gr.Textbox(label="Tamaño (ancho x alto)")
    size_btn.click(image_size_str, [size_img], size_out, api_name="image_size", concurrency_limit=4)

# Enable the request queue (at most 16 pending requests) and start the server.
demo.queue(max_size=16).launch()
150
+