Spaces:

VeuReu
/

svision

Running on Zero

App Files Files Community

VeuReu committed 24 days ago

Commit

0aa10f8

verified ·

1 Parent(s): c55a9c1

Update app.py

Browse files

Files changed (1) hide show

app.py +374 -108

app.py CHANGED Viewed

@@ -28,6 +28,7 @@ APIs/UI and the underlying machine learning models.
 # Standard library
 import json
 import os
 from typing import Any, Dict, List, Optional, Tuple, Union
 # Third-party libraries
@@ -41,6 +42,8 @@ from PIL import Image
 from scenedetect import SceneManager, VideoManager
 from scenedetect.detectors import ContentDetector
 from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
 '''
@@ -210,22 +213,31 @@ def _get_face_embedding(
     """
     try:
         mtcnn, facenet = _load_face_models()
-        # Detect and extract face
-        face = mtcnn(image)
-        if face is None:
-            return None
-        # FaceNet expects tensor of shape (1,3,160,160)
         device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
-        face = face.unsqueeze(0).to(device)
-        # Get embedding
-        with torch.no_grad():
-            emb = facenet(face).cpu().numpy()[0]
-        # Normalize embedding
-        emb = emb / np.linalg.norm(emb)
-        return emb.astype(float).tolist()
     except Exception as e:
         print(f"Face embedding failed: {e}")
@@ -296,6 +308,216 @@ def _get_scenes_extraction(
         print("Error in scenes_extraction:", e)
         return None, None
 """
 # ==============================================================================
 # API Helpers
@@ -432,79 +654,66 @@ def scenes_extraction(
     """
     return _get_scenes_extraction(video_file, threshold, offset_frames, crop_ratio)
-@spaces.GPU
 def describe_list_images(
     images: List[Image.Image]
 ) -> List[str]:
     """
-    Generate brief visual descriptions for a list of PIL Images using Salamandra Vision.
     Args:
-        images (List[Image.Image]): List of PIL Image objects to describe.
     Returns:
-        List[str]: List of descriptions, one per image.
     """
-    # Load the Salamandra Vision model
-    path_model = "BSC-LT/salamandra-7b-vision"
-    processor = AutoProcessor.from_pretrained(path_model)
-    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
-        path_model,
-        torch_dtype=torch.float16,
-        low_cpu_mem_usage=True
-    ).to("cuda")
-    # System prompt for image description
-    sys_prompt = (
-        "You are an expert in visual storytelling. "
-        "Describe the image very briefly and simply in Catalan, "
-        "explaining only the main action seen. "
-        "Respond with a single short sentence (maximum 10–20 words), "
-        "without adding unnecessary details or describing the background."
-    )
-    all_results = []
-    for img in images:
-        batch = [img]
-        # Create the conversation template
-        conversation = [
-            {"role": "system", "content": sys_prompt},
-            {"role": "user", "content": [
-                {"type": "image", "image": batch[0]},
-                {"type": "text", "text": (
-                    "Describe the image very briefly and simply in Catalan."
-                )}
-            ]}
-        ]
-        prompt_batch = processor.apply_chat_template(conversation, add_generation_prompt=True)
-        # Prepare inputs for the model
-        inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
-        for k, v in inputs.items():
-            if v.dtype.is_floating_point:
-                inputs[k] = v.to("cuda", torch.float16)
-            else:
-                inputs[k] = v.to("cuda")
-        # Generate the description
-        output = model.generate(**inputs, max_new_tokens=1024)
-        text = processor.decode(output[0], skip_special_tokens=True)
-        lines = text.split("\n")
-        # Extract the assistant's answer
-        desc = ""
-        for i, line in enumerate(lines):
-            if line.lower().startswith(" assistant"):
-                desc = "\n".join(lines[i+1:]).strip()
-                break
-        print("====================")
-        print(desc)
-        all_results.append(desc)
-    return all_results
 """
 # ==============================================================================
@@ -558,32 +767,51 @@ def _compose_prompt(user_text: str, context: Optional[Dict] = None) -> List[Dict
     ]
     return convo
-with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
-    gr.Markdown("## Salamandra-Vision 7B · ZeroGPU\nImage + text → description.")
     with gr.Row():
         with gr.Column():
-            in_img = gr.Image(label="Image", type="pil")
-            in_txt = gr.Textbox(label="Text/prompt", value="Describe the image in detail (ES/CA).")
-            max_new = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
-            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
-            btn = gr.Button("Generate", variant="primary")
         with gr.Column():
-            out = gr.Textbox(label="Description", lines=18)
-    # Single image inference
     btn.click(_infer_one, [in_img, in_txt, max_new, temp], out, api_name="describe", concurrency_limit=1)
-    # Batch API for engine (Gradio Client): images + context_json → list[str]
-    batch_in_images = gr.Gallery(label="Batch images", show_label=False, columns=4, height="auto")
     batch_context = gr.Textbox(label="context_json", value="{}", lines=4)
-    batch_max = gr.Slider(16, 1024, value=256, step=16, label="max_new_tokens")
-    batch_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
-    batch_btn = gr.Button("Describe batch")
-    batch_out = gr.JSON(label="Descriptions (list)")
-    # Note: Gradio Gallery returns paths/objects; the client is used to load files
     batch_btn.click(
         describe_batch,
         [batch_in_images, batch_context, batch_max, batch_temp],
@@ -591,25 +819,32 @@ with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
         api_name="predict",
         concurrency_limit=1
     )
-    # Facial embedding section
     with gr.Row():
-        face_img = gr.Image(label="Image for facial embedding", type="pil")
-        face_btn = gr.Button("Get facial embedding")
-        face_out = gr.JSON(label="Facial embedding (vector)")
     face_btn.click(face_image_embedding, [face_img], face_out, api_name="face_image_embedding", concurrency_limit=1)
-    # Video scene extraction section
     with gr.Row():
-        video_file = gr.Video(label="Upload a video")
-        threshold = gr.Slider(0.0, 100.0, value=30.0, step=1.0, label="Threshold")
-        offset_frames = gr.Slider(0, 30, value=5, step=1, label="Offset frames")
-        crop_ratio = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Crop ratio")
-        scenes_btn = gr.Button("Extract scenes")
-        scenes_gallery_out = gr.Gallery(label="Scene keyframes", show_label=False, columns=4, height="auto")
-        scenes_info_out = gr.JSON(label="Scene information")
-    # Bind the scene extraction function
     scenes_btn.click(
         scenes_extraction,
         inputs=[video_file, threshold, offset_frames, crop_ratio],
@@ -617,21 +852,52 @@ with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU") as demo:
         api_name="scenes_extraction",
         concurrency_limit=1
     )
-    # List image description with Salamandra Vision
     with gr.Row():
-        img_input = gr.Gallery(label="List images", show_label=False, columns=4, height="auto")
-        describe_btn = gr.Button("Generate descriptions")
-        desc_output = gr.Textbox(label="Image descriptions", lines=10)
     describe_btn.click(
-        fn=lambda imgs: describe_list_images([img for img in imgs if isinstance(img, Image.Image)])
-                    if imgs else ["No images uploaded."],
         inputs=[img_input],
         outputs=desc_output,
         api_name="describe_images",
         concurrency_limit=1
     )
 demo.queue(max_size=16).launch(show_error=True)

 # Standard library
 import json
 import os
+import re
 from typing import Any, Dict, List, Optional, Tuple, Union
 # Third-party libraries
 from scenedetect import SceneManager, VideoManager
 from scenedetect.detectors import ContentDetector
 from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
+from wordfreq import zipf_frequency
+import easyocr
 '''
     """
     try:
         mtcnn, facenet = _load_face_models()
+        boxes, probs = mtcnn.detect(image)
+        if boxes is None:
+            return []
+        embeddings = []
         device = DEVICE if DEVICE == "cuda" and torch.cuda.is_available() else "cpu"
+        for box in boxes:
+            x1, y1, x2, y2 = map(int, box)
+            face = image.crop((x1, y1, x2, y2))
+            face_tensor = mtcnn(face)
+            if face_tensor is None:
+                continue
+            face_tensor = face_tensor.unsqueeze(0).to(device)
+            with torch.no_grad():
+                emb = facenet(face_tensor).cpu().numpy()[0]
+            emb = emb / np.linalg.norm(emb)
+            embeddings.append(emb.astype(float).tolist())
+        return embeddings
     except Exception as e:
         print(f"Face embedding failed: {e}")
         print("Error in scenes_extraction:", e)
         return None, None
+@spaces.GPU
+def _get_image_list_description(
+    images: List[Image.Image]
+) -> List[str]:
+    """
+    Generate brief Catalan descriptions for a list of images using Salamandra Vision.
+    Each image is sent to the model on its own, together with a fixed Catalan
+    system prompt. From the decoded output, the text that follows the first line
+    starting with " assistant" (case-insensitive) is kept as the description.
+    Args:
+        images (List[Image.Image]): Images to describe. Every item may be a bare
+            PIL image or an ``(image, caption)`` pair, in which case only the
+            first element is used. ``None`` or an empty list yields ``[]``.
+    Returns:
+        List[str]: One description per image, in input order. An entry is ""
+        when no " assistant" line is found in the decoded output.
+    """
+    # Nothing to describe: return before paying for the model load.
+    if not images:
+        return []
+    # Accept bare images as well as (image, caption) pairs; indexing a bare
+    # PIL image with [0] would raise TypeError.
+    list_images = [x[0] if isinstance(x, (tuple, list)) else x for x in images]
+    # Load the Salamandra Vision model
+    path_model = "BSC-LT/salamandra-7b-vision"
+    processor = AutoProcessor.from_pretrained(path_model)
+    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
+        path_model,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=False
+    ).to("cuda")
+    # System prompt for image description (Catalan, one short sentence)
+    sys_prompt = (
+        "Ets un expert en narrativa visual. "
+        "Descriu la imatge de manera molt breu i senzilla en català, "
+        "explicant només l'acció principal que es veu. "
+        "Respon amb una única frase curta (màxim 10–20 paraules), "
+        "sense afegir detalls innecessaris ni descriure el fons."
+    )
+    all_results = []
+    for img in list_images:
+        # Create the conversation template for this single image
+        conversation = [
+            {"role": "system", "content": sys_prompt},
+            {"role": "user", "content": [
+                {"type": "image", "image": img},
+                {"type": "text", "text": (
+                    "Descriu la imatge de manera molt breu i senzilla en català."
+                )}
+            ]}
+        ]
+        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+        # Prepare inputs for the model: floating-point tensors are cast to
+        # float16 to match the model weights, all others are only moved to CUDA.
+        inputs = processor(images=[img], text=prompt, return_tensors="pt")
+        for k, v in inputs.items():
+            if v.dtype.is_floating_point:
+                inputs[k] = v.to("cuda", torch.float16)
+            else:
+                inputs[k] = v.to("cuda")
+        # Generate the description
+        output = model.generate(**inputs, max_new_tokens=1024)
+        text = processor.decode(output[0], skip_special_tokens=True)
+        lines = text.split("\n")
+        # Extract the assistant's answer: everything after the marker line
+        desc = ""
+        for i, line in enumerate(lines):
+            if line.lower().startswith(" assistant"):
+                desc = "\n".join(lines[i+1:]).strip()
+                break
+        all_results.append(desc)
+    return all_results
+@spaces.GPU
+def _get_ocr_characters_to_image(
+    image: Image.Image,
+    informacion_image: Dict[str, Any],
+    face_col: List[Dict[str, Any]]
+    ) -> Optional[Dict[str, Any]]:
+    """
+    Identify the faces in an image against a reference collection and extract
+    its OCR text with EasyOCR.
+    Steps:
+    1. Decode the metadata and the reference collection when they arrive as
+       JSON text, and read the required metadata keys up front so malformed
+       input fails before any model work is done.
+    2. Compute one embedding per detected face via `_get_face_embedding`.
+    3. For each face, find the stored embedding with the smallest Euclidean
+       distance; the stored name is used when that distance is below 0.8,
+       otherwise the face is labelled "Desconegut".
+    4. Run EasyOCR (English + Spanish) on the image. The joined text is kept
+       only if at least one purely alphabetic word has a non-zero Catalan
+       zipf frequency; otherwise the OCR text is reset to "".
+    Parameters
+    ----------
+    image : PIL.Image.Image
+        The image to process.
+    informacion_image : Dict[str, Any]
+        Image metadata with the keys ``index``, ``start`` and ``end``, either
+        as a decoded dict or as JSON text.
+    face_col : List[Dict[str, Any]]
+        Reference faces, each with the keys ``nombre`` and ``embedding``,
+        either as a decoded list or as JSON text. Blank text means no
+        reference faces.
+    Returns
+    -------
+    Optional[Dict[str, Any]]
+        ``None`` if the OCR step raises; otherwise a dictionary containing:
+            - id: image identifier (``index`` from the metadata)
+            - start: start timestamp
+            - end: end timestamp
+            - faces: one identity per detected face
+            - ocr: extracted OCR text ("" when filtered out)
+    Raises
+    ------
+    ValueError
+        If a JSON text argument cannot be decoded (``json.JSONDecodeError``).
+    KeyError
+        If the metadata lacks ``index``, ``start`` or ``end``.
+    """
+    # The annotations promise decoded objects, yet JSON text is also a valid
+    # input (json.loads was applied unconditionally before); accept both.
+    if isinstance(informacion_image, (str, bytes, bytearray)):
+        informacion_image_dict = json.loads(informacion_image)
+    else:
+        informacion_image_dict = informacion_image
+    if isinstance(face_col, (str, bytes, bytearray)):
+        # A blank value means "no reference faces" rather than a decode error.
+        face_col = json.loads(face_col) if face_col.strip() else []
+    # Fail fast on missing metadata instead of after the GPU work.
+    image_id = informacion_image_dict["index"]
+    start = informacion_image_dict["start"]
+    end = informacion_image_dict["end"]
+    # Detect faces and generate embeddings; a missing result counts as no faces.
+    raw_faces = _get_face_embedding(image) or []
+    max_distance = 0.8  # distance below which the nearest stored face is accepted
+    faces_detected = []
+    for embedding_image in raw_faces:
+        identity = "Desconegut"
+        if face_col and embedding_image is not None:
+            try:
+                query = np.array(embedding_image)
+                # Nearest stored embedding by Euclidean distance; on ties the
+                # earliest entry of face_col wins.
+                best_distance, best_name = min(
+                    (
+                        (
+                            float(np.linalg.norm(query - np.array(entry["embedding"]))),
+                            entry["nombre"],
+                        )
+                        for entry in face_col
+                    ),
+                    key=lambda pair: pair[0],
+                )
+                if best_distance < max_distance:
+                    identity = best_name
+            except Exception as e:
+                # Best effort per face: a malformed entry leaves it unidentified.
+                print(f"Face KNN failed: {e}")
+                identity = "Desconegut"
+        faces_detected.append(identity)
+    # Now perform OCR detection
+    ocr_text_easyocr = ""
+    try:
+        # Force three RGB channels so the RGB->BGR conversion also works for
+        # RGBA, palette and grayscale images.
+        rgb = np.array(image.convert("RGB"))
+        bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
+        # EasyOCR reader for English and Spanish
+        reader = easyocr.Reader(['en', 'es'], gpu=True)
+        results = reader.readtext(bgr)
+        # Join OCR results into a single text string
+        ocr_text_easyocr = " ".join([text for _, text, _ in results]).strip()
+        # Keep only purely alphabetic words for the plausibility check
+        palabras_ocr_text = [
+            p for p in ocr_text_easyocr.split()
+            if re.fullmatch(r'[A-Za-zÀ-ÿ]+', p)
+        ]
+        # Discard the OCR text unless at least one word is known to wordfreq
+        if not any(zipf_frequency(p, "ca") != 0.0 for p in palabras_ocr_text):
+            ocr_text_easyocr = ""
+    except Exception as e:
+        # Kept for backward compatibility: an OCR failure yields None.
+        print(f"OCR error: {e}")
+        return None
+    # Final structured output with metadata, faces, and OCR
+    return {
+        "id": image_id,
+        "start": start,
+        "end": end,
+        "faces": faces_detected,
+        "ocr": ocr_text_easyocr,
+    }
 """
 # ==============================================================================
 # API Helpers
     """
     return _get_scenes_extraction(video_file, threshold, offset_frames, crop_ratio)
 def describe_list_images(
     images: List[Image.Image]
 ) -> List[str]:
     """
+    Public endpoint: produce a short description for every image in a list.
+    Thin pass-through that forwards ``images`` untouched to
+    `_get_image_list_description` and hands back whatever that call returns.
     Args:
+        images (List[Image.Image]): The images to describe.
     Returns:
+        List[str]: The descriptions produced by `_get_image_list_description`.
     """
+    descriptions = _get_image_list_description(images)
+    return descriptions
+def add_ocr_characters_to_image(
+    image: Image.Image,
+    informacion_image: Dict[str, Any],
+    face_col: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+    """
+    Public endpoint: annotate one image with recognised faces and OCR text.
+    Thin pass-through to `_get_ocr_characters_to_image`; all three arguments
+    are forwarded unchanged and the helper's result is returned as is. The
+    helper computes face embeddings, matches them against ``face_col``, runs
+    EasyOCR on the image and filters the recognised text.
+    Parameters
+    ----------
+    image : PIL.Image.Image
+        The image to analyse.
+    informacion_image : Dict[str, Any]
+        Image metadata holding ``index``, ``start`` and ``end``. The UI binds
+        this to a textbox, so it arrives as JSON text; decoding is left to
+        the helper.
+    face_col : List[Dict[str, Any]]
+        Reference faces with ``nombre`` and ``embedding`` entries. The UI also
+        binds this to a textbox, so it arrives as JSON text; decoding is left
+        to the helper.
+    Returns
+    -------
+    Dict[str, Any]
+        The helper's result, a dictionary with:
+            - id: the image identifier
+            - start: start timestamp
+            - end: end timestamp
+            - faces: detected face identities
+            - ocr: the extracted OCR text
+        The helper returns ``None`` instead when its OCR step raises.
+    """
+    result = _get_ocr_characters_to_image(image, informacion_image, face_col)
+    return result
 """
 # ==============================================================================
     ]
     return convo
+# Stylesheet for the Gradio page: renders every <h2> as a full-width, centred,
+# rounded grey banner with a soft shadow. The text inside the string is CSS
+# and must stay as is (its inline notes are in Catalan).
+custom_css = """
+h2 {
+    background: #e3e4e6 !important;
+    padding: 14px 22px !important;
+    border-radius: 14px !important;
+    box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
+    display: block !important;       /* ocupa tot l'ample */
+    width: 100% !important;          /* assegura 100% */
+    margin: 20px auto !important;
+    text-align:center;
+}
+"""
+with gr.Blocks(title="Salamandra Vision 7B · ZeroGPU", css=custom_css) as demo:
+    # Main title H1 centered
+    gr.Markdown('<h1 style="text-align:center">SALAMANDRA VISION 7B · ZEROGPU</h1>')
+    gr.Markdown("---")
+    # ---------------------
+    # Section: Single image inference
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Inferència per imatge única</h2>')
     with gr.Row():
         with gr.Column():
+            in_img = gr.Image(label="Imatge", type="pil")
+            in_txt = gr.Textbox(label="Text/prompt", value="Descriu la imatge amb detall (ES/CA).")
+            max_new = gr.Slider(16, 1024, value=256, step=16, label="màx_tokens nous")
+            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperatura")
+            btn = gr.Button("Genera", variant="primary")
         with gr.Column():
+            out = gr.Textbox(label="Descripció", lines=18)
     btn.click(_infer_one, [in_img, in_txt, max_new, temp], out, api_name="describe", concurrency_limit=1)
+    gr.Markdown("---")
+    # ---------------------
+    # Section: Batch images
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Llot d’imatges</h2>')
+    batch_in_images = gr.Gallery(label="Llot d’imatges", show_label=False, columns=4, height="auto")
     batch_context = gr.Textbox(label="context_json", value="{}", lines=4)
+    batch_max = gr.Slider(16, 1024, value=256, step=16, label="màx_tokens nous")
+    batch_temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperatura")
+    batch_btn = gr.Button("Descriu el lot")
+    batch_out = gr.JSON(label="Descripcions (llista)")
     batch_btn.click(
         describe_batch,
         [batch_in_images, batch_context, batch_max, batch_temp],
         api_name="predict",
         concurrency_limit=1
     )
+    gr.Markdown("---")
+    # ---------------------
+    # Section: Facial embeddings
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Embeddings facials</h2>')
     with gr.Row():
+        face_img = gr.Image(label="Imatge per embedding facial", type="pil")
+        face_btn = gr.Button("Obté embedding facial")
+        face_out = gr.JSON(label="Embedding facial (vector)")
     face_btn.click(face_image_embedding, [face_img], face_out, api_name="face_image_embedding", concurrency_limit=1)
+    gr.Markdown("---")
+    # ---------------------
+    # Section: Video scene extraction
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Extracció d’escenes de vídeo</h2>')
     with gr.Row():
+        video_file = gr.Video(label="Puja un vídeo")
+        threshold = gr.Slider(0.0, 100.0, value=30.0, step=1.0, label="Llindar")
+        offset_frames = gr.Slider(0, 30, value=5, step=1, label="Desplaçament de frames")
+        crop_ratio = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Raó de retall")
+        scenes_btn = gr.Button("Extreu escenes")
+        scenes_gallery_out = gr.Gallery(label="Fotogrames clau de l’escena", show_label=False, columns=4, height="auto")
+        scenes_info_out = gr.JSON(label="Informació de l’escena")
     scenes_btn.click(
         scenes_extraction,
         inputs=[video_file, threshold, offset_frames, crop_ratio],
         api_name="scenes_extraction",
         concurrency_limit=1
     )
+    gr.Markdown("---")
+    # ---------------------
+    # Section: Batch description with Salamandra Vision
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Descripció per lots amb Salamandra Vision</h2>')
     with gr.Row():
+        img_input = gr.Gallery(label="Llot d’imatges", show_label=False)
+        describe_btn = gr.Button("Genera descripcions")
+        desc_output = gr.Textbox(label="Descripcions de les imatges")
     describe_btn.click(
+        describe_list_images,
         inputs=[img_input],
         outputs=desc_output,
         api_name="describe_images",
         concurrency_limit=1
     )
+    gr.Markdown("---")
+    # ---------------------
+    # Section: Add OCR and characters to image
+    # ---------------------
+    gr.Markdown('<h2 style="text-align:center">Afegiu OCR i informació de caràcters al vídeo</h2>')
+    with gr.Row():
+        img_input = gr.Image(label="Imatge per ampliar la descripció", type="pil")
+        info_input = gr.Textbox(
+            label="Diccionari informacion_image (format JSON)",
+            placeholder='{"index": 0, "start": 0.0, "end": 1.2}',
+            lines=3
+        )
+        faces_input = gr.Textbox(
+            label="Llistat de diccionaris face_col (format JSON)",
+            placeholder='[{"nombre": "Anna", "embedding": [0.12, 0.88, ...]}, ...]',
+            lines=5
+        )
+        process_btn = gr.Button("Processar imatge (OCR + Persones)")
+        output_json = gr.JSON(label="Resultat complet")
+    process_btn.click(
+        add_ocr_characters_to_image,
+        inputs=[img_input, info_input, faces_input],
+        outputs=output_json,
+        api_name="add_ocr_and_faces",
+        concurrency_limit=1
+    )
 demo.queue(max_size=16).launch(show_error=True)