Update app.py

app.py CHANGED
@@ -28,6 +28,8 @@ import spaces
 import subprocess
 import os
 from moviepy.editor import VideoFileClip, AudioFileClip
+import multiprocessing
+import imageio
 
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
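One detail worth flagging in the unchanged install line above: passing `env={...}` to `subprocess.run` replaces the inherited environment rather than extending it, so variables such as `PATH` do not reach the child shell. A minimal, more defensive sketch (a suggestion, not part of this commit):

```python
import os
import subprocess

# Sketch: merge the build flag into the inherited environment instead of
# replacing it, so PATH and other variables still reach the pip process.
env = {**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}
subprocess.run("pip install flash-attn --no-build-isolation", env=env, shell=True, check=True)
```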
@@ -446,7 +448,14 @@ model = AutoModel.from_pretrained(
     trust_remote_code=True,
 ).eval().cuda()
 tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainableAI", trust_remote_code=True, use_fast=False)
-
+
+# Wrapper functions to pass into multiprocessing
+def generate_text_img_wrapper(args):
+    return generate_text_image_with_html2image(*args, image_width=500, min_height=1000)
+
+def generate_hidden_img_wrapper(args):
+    return render_next_token_table_image(*args)
+
 @spaces.GPU
 def generate_video(image, prompt, max_tokens):
     print(image)
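The wrappers added here follow the standard `multiprocessing` constraint: `Pool.map` pickles the callable, so it must be a module-level function (not a lambda or closure), and it hands each worker exactly one argument, hence the tuple that the wrapper unpacks with `*args`. A minimal self-contained sketch of the same pattern, with a hypothetical `render` standing in for the Space's renderers:

```python
from multiprocessing import Pool

def render(token, width):
    # Hypothetical stand-in for generate_text_image_with_html2image /
    # render_next_token_table_image.
    return f"{token}@{width}"

def render_wrapper(args):
    # Top-level wrapper: unpacks the single tuple Pool.map delivers and
    # pins the fixed keyword argument, mirroring the diff above.
    return render(*args, width=500)

if __name__ == "__main__":
    params = [("hello",), ("world",)]
    with Pool(processes=4) as pool:
        print(pool.map(render_wrapper, params))  # ['hello@500', 'world@500']
```

`Pool.starmap` would unpack the argument tuples natively; an explicit wrapper is still a simple way to pin fixed keyword arguments like `image_width=500` without reaching for `functools.partial`.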
@@ -517,11 +526,52 @@ def generate_video(image, prompt, max_tokens):
 
         input_token = predict_token_text
         heatmap_imgs.append(overlay)
+
+    # Render the per-token images in parallel with multiprocessing
+    with multiprocessing.Pool(processes=20) as pool:
+        text_imgs = pool.map(generate_text_img_wrapper, params_for_text)
+        hidden_imgs = pool.map(generate_hidden_img_wrapper, params_for_hidden)
+
+    for i in range(len(text_imgs)):
+        overlay = heatmap_imgs[i]
+        text_img = text_imgs[i]
+        predict_hidden_states = hidden_imgs[i]
+        overlay_adjusted = adjust_overlay(overlay, text_img)
+        predict_hidden_states = adjust_overlay(predict_hidden_states, text_img)
+        combined_image = np.hstack((overlay_adjusted, text_img, predict_hidden_states))
+        visualization_frames.append(combined_image)
 
-
+    resized_visualization_frames = []
+    for frame in visualization_frames:
+        frame = cv2.resize(frame, (visualization_frames[0].shape[1], visualization_frames[0].shape[0]))
+        resized_visualization_frames.append(frame)
+
+    # Save as an MP4 video with imageio
+    imageio.mimsave(
+        'heatmap_animation.mp4',
+        resized_visualization_frames,  # RGB frames
+        fps=5
+    )
+
+    return "heatmap_animation.mp4"
 
 with gr.Blocks() as demo:
-    gr.Markdown("
+    gr.Markdown("""## 🎥 Visualizing How Multimodal Models Think
+This tool generates a video to **visualize how a multimodal model (image + text)** attends to different parts of an image while generating text.
+### 📌 What it does:
+- Takes an input image and a text prompt.
+- Shows how the model's attention shifts across the image for each generated token.
+- Helps explain the model's behavior and decision-making.
+### 🖼️ Video layout (per frame):
+Each frame in the video includes:
+1. 🔥 **Heatmap over image**: shows which areas the model focuses on.
+2. 📝 **Generated text**: the prior context, with the current token highlighted.
+3. 📊 **Token prediction table**: the model's top next-token guesses.
+### 🎯 Use cases:
+- Researching the explainability of vision-language models.
+- Debugging or interpreting model outputs.
+- Creating educational visualizations.
+""")
 
     with gr.Row():
         with gr.Column():
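Two details in the new tail of `generate_video` are easy to miss: `pool.map` preserves input order, so `text_imgs[i]` still lines up with `heatmap_imgs[i]`, and `np.hstack` requires all three panels to share a height, which is presumably what `adjust_overlay` normalizes (an assumption; its definition is outside this diff). A sketch of that height-matching step under that assumption:

```python
import cv2
import numpy as np

def match_height(img, target_h):
    # Assumed behavior of adjust_overlay: rescale a panel so its height
    # matches the reference text image before horizontal stacking.
    scale = target_h / img.shape[0]
    return cv2.resize(img, (max(1, int(img.shape[1] * scale)), target_h))

def combine(overlay, text_img, table_img):
    h = text_img.shape[0]
    return np.hstack((match_height(overlay, h), text_img, match_height(table_img, h)))
```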