Spaces:
Running
on
Zero
Running
on
Zero
File size: 12,351 Bytes
5b3defa 81bd1a1 5b3defa 79927f3 5b3defa 79927f3 5b3defa 79927f3 5b3defa 79927f3 4700e59 79927f3 5b3defa 81bd1a1 5b3defa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 |
"""
Visualization utilities for drawing text detection boxes on images
"""
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from typing import List, Dict, Tuple
import os
import math
def generate_random_color() -> Tuple[int, int, int]:
    """
    Generate a random color for bounding boxes

    The red and green channels are drawn from [0, 200) and the blue channel
    from [0, 255), using NumPy's global random state.

    Returns:
        RGB color tuple of built-in Python ints
    """
    # np.random.randint returns NumPy integer scalars; convert them so the
    # result really is a tuple of plain ints, as the annotation promises.
    # The three draws stay in R, G, B order so seeded sequences are unchanged.
    red = int(np.random.randint(0, 200))
    green = int(np.random.randint(0, 200))
    blue = int(np.random.randint(0, 255))
    return (red, green, blue)
def _load_label_font(font_size: int):
    """
    Load a CJK-capable TrueType font for box labels

    Args:
        font_size: Size passed to ImageFont.truetype

    Returns:
        The first candidate font that can be opened, or PIL's built-in
        default font when none of the candidates load
    """
    local_fonts = os.path.join(os.path.dirname(__file__), "fonts")
    candidate_paths = [
        # Local fonts (project/fonts/) - slim/light weights first, for portability
        os.path.join(local_fonts, "NotoSansCJK-Light.ttc"),
        os.path.join(local_fonts, "NotoSansCJK-Regular.ttc"),
        os.path.join(local_fonts, "STHeiti-Light.ttc"),
        # macOS fonts
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/Hiragino Sans GB.ttc",
        # Linux fonts
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Light.ttc",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
    ]
    for path in candidate_paths:
        try:
            return ImageFont.truetype(path, font_size)
        except (IOError, OSError):
            continue
    return ImageFont.load_default()


def _layout_wrapped_text(draw, text, font, max_width):
    """
    Wrap text character by character to a pixel width and measure the result

    Args:
        draw: ImageDraw object used to measure rendered text
        text: Text to lay out; explicit newlines start a new line
        font: Font used for measuring (must provide getmetrics())
        max_width: Width in pixels a line must stay below

    Returns:
        Tuple (lines, total_height, max_line_width, line_step) where
        line_step is the vertical distance between consecutive lines
    """
    ascent, descent = font.getmetrics()
    line_height = ascent + descent
    line_step = line_height * 1.2  # 1.2 line spacing

    def rendered_width(fragment):
        left, _, right, _ = draw.textbbox((0, 0), fragment, font=font)
        return right - left

    lines = []
    for raw_line in text.split('\n'):
        current = ""
        for char in raw_line:
            candidate = current + char
            if rendered_width(candidate) < max_width:
                current = candidate
            else:
                # The character does not fit: close the current line and
                # start the next one with it (a lone character that is
                # wider than max_width still gets a line of its own).
                if current:
                    lines.append(current)
                current = char
        if current:
            lines.append(current)

    if not lines:
        return [], 0, 0, line_step

    total_height = len(lines) * line_height * 1.2
    max_line_width = max(rendered_width(line) for line in lines)
    return lines, total_height, max_line_width, line_step


def draw_detection_boxes(
    image: Image.Image,
    detections: List[Dict],
    box_width: int = 2,
    font_size: int = 12,
    show_text: bool = True,
    merge_boxes: bool = True
) -> Image.Image:
    """
    Draw text detection boxes with labels on image

    Each detection is rendered as a filled, borderless rectangle anchored at
    its top-left corner, with the detection text wrapped inside it. The
    rectangle may grow up to 20% beyond the detected size in each dimension
    to fit the text, and never shrinks below the detected size.

    Args:
        image: PIL Image to draw on (not modified; a copy is drawn on)
        detections: List of detection dicts with 'text', 'x1', 'y1', 'x2', 'y2'
        box_width: Width of bounding box lines. Accepted for interface
            compatibility; boxes are drawn without a border, so it is
            currently not used
        font_size: Font size for text labels (only applies when a TrueType
            font can be loaded; PIL's default font is used otherwise)
        show_text: Whether to show text labels. Accepted for interface
            compatibility; currently not used, text is always drawn
        merge_boxes: Whether to merge close boxes (default: True)

    Returns:
        New RGB image with boxes and labels drawn
    """
    # Merge detections if requested
    if merge_boxes:
        detections = merge_detections(detections)

    # Work on an RGBA copy so the caller's image is left untouched
    img_draw = image.copy().convert('RGBA')
    draw = ImageDraw.Draw(img_draw)

    # Load the label font once; it is the same for every detection
    font = _load_label_font(font_size)

    pad_x = 4  # horizontal padding between box edge and text, per side
    background = (255, 250, 240)  # FloralWhite (soft background)
    text_color = (150, 0, 0)  # dark red

    for detection in detections:
        # Best effort: a malformed detection is reported and skipped so the
        # remaining ones are still drawn.
        try:
            text = detection['text']
            x1, y1 = detection['x1'], detection['y1']
            box_w = detection['x2'] - x1
            box_h = detection['y2'] - y1

            # Calculate max allowed dimensions (max 20% larger)
            max_allowed_w = int(box_w * 1.2)
            max_allowed_h = int(box_h * 1.2)

            # Lay out at the max allowed width to minimize height
            lines, total_h, max_line_w, line_step = _layout_wrapped_text(
                draw, text, font, max_allowed_w - 2 * pad_x
            )

            # Grow to fit the text, capped at 20% expansion, and never
            # shrink below the detected size
            box_w = max(box_w, min(max_line_w + 2 * pad_x, max_allowed_w))
            box_h = max(box_h, min(total_h + 4, max_allowed_h))

            # Draw box with soft background (no border)
            draw.rectangle(
                [x1, y1, x1 + box_w, y1 + box_h],
                fill=background,
                outline=None
            )

            # Draw text left-aligned horizontally and centered vertically
            start_y = y1 + (box_h - total_h) / 2
            for row, line in enumerate(lines):
                draw.text(
                    (x1 + pad_x, start_y + row * line_step),
                    line,
                    font=font,
                    fill=text_color
                )
        except Exception as e:
            print(f"Error drawing detection box: {str(e)}")

    # Convert back to RGB
    return img_draw.convert('RGB')
def create_side_by_side_comparison(
    original: Image.Image,
    annotated: Image.Image,
    spacing: int = 20
) -> Image.Image:
    """
    Build a single image showing the original and the annotated picture
    next to each other

    The original is placed on the left, the annotated image on the right,
    separated by a white gap, and each half gets a caption near its
    top-left corner.

    Args:
        original: Original image
        annotated: Annotated image with boxes
        spacing: Space between images in pixels

    Returns:
        Combined RGB image showing both versions
    """
    left_w, left_h = original.size
    right_w, right_h = annotated.size
    right_x = left_w + spacing

    # White canvas wide enough for both images plus the gap, as tall as
    # the taller of the two
    canvas = Image.new(
        'RGB',
        (left_w + right_w + spacing, max(left_h, right_h)),
        (255, 255, 255)
    )
    canvas.paste(original, (0, 0))
    canvas.paste(annotated, (right_x, 0))

    pen = ImageDraw.Draw(canvas)

    # Caption font: first loadable candidate, otherwise PIL's default
    caption_font_candidates = (
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/Hiragino Sans GB.ttc",
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    for candidate in caption_font_candidates:
        try:
            caption_font = ImageFont.truetype(candidate, 24)
            break
        except (IOError, OSError):
            pass
    else:
        caption_font = ImageFont.load_default()

    black = (0, 0, 0)
    pen.text((10, 10), "Original", font=caption_font, fill=black)
    pen.text((right_x + 10, 10), "Detected Text", font=caption_font, fill=black)
    return canvas
def get_detection_summary(detections: List[Dict]) -> str:
    """
    Render detection results as a human-readable, numbered list

    Each entry shows the detected text and its box corners. When an entry
    carries an 'original_text' that differs from its 'text', both are shown
    as original and translated text.

    Args:
        detections: List of detection dictionaries

    Returns:
        Formatted summary string
    """
    if not detections:
        return "No text detected in the image."

    pieces = [f"Detected {len(detections)} text region(s):\n\n"]
    for number, entry in enumerate(detections, 1):
        has_translation = (
            'original_text' in entry
            and entry['original_text'] != entry['text']
        )
        if has_translation:
            pieces.append(f"{number}. Original: \"{entry['original_text']}\"\n")
            pieces.append(f" Translated: \"{entry['text']}\"\n")
        else:
            pieces.append(f"{number}. \"{entry['text']}\"\n")
        pieces.append(
            f" Location: ({entry['x1']}, {entry['y1']}) → ({entry['x2']}, {entry['y2']})\n\n"
        )
    return "".join(pieces)
def merge_detections(detections: List[Dict], threshold: int = 30) -> List[Dict]:
    """
    Combine detection boxes that lie close together into single boxes

    Two boxes are linked when one of them, grown by `threshold` on every
    side, touches or overlaps the other. Linked boxes are grouped
    transitively, and each group becomes one detection whose box is the
    bounding rectangle of the group. The group's texts are concatenated in
    right-to-left, then top-to-bottom order, with all space characters
    removed. Only 'text', 'x1', 'y1', 'x2', 'y2' appear in the output dicts.

    Args:
        detections: List of detection dicts
        threshold: Distance threshold for merging

    Returns:
        List of merged detection dicts
    """
    if not detections:
        return []

    def within_reach(first, second, margin):
        # `first` grown by `margin` must not be entirely left of, right of,
        # above or below `second`.
        if first['x2'] + margin < second['x1']:
            return False
        if first['x1'] - margin > second['x2']:
            return False
        if first['y2'] + margin < second['y1']:
            return False
        if first['y1'] - margin > second['y2']:
            return False
        return True

    total = len(detections)

    # Undirected proximity graph over detection indices
    neighbours = [[] for _ in range(total)]
    for a in range(total):
        for b in range(a + 1, total):
            if within_reach(detections[a], detections[b], threshold):
                neighbours[a].append(b)
                neighbours[b].append(a)

    # Walk each connected group once, using an explicit stack
    seen = set()
    merged = []
    for start in range(total):
        if start in seen:
            continue

        seen.add(start)
        pending = [start]
        group = []
        while pending:
            index = pending.pop()
            group.append(detections[index])
            fresh = [nb for nb in neighbours[index] if nb not in seen]
            seen.update(fresh)
            pending.extend(fresh)

        # Bounding rectangle of the whole group
        left = min(member['x1'] for member in group)
        top = min(member['y1'] for member in group)
        right = max(member['x2'] for member in group)
        bottom = max(member['y2'] for member in group)

        # Right-to-Left (descending X), then Top-to-Bottom (ascending Y):
        # the standard Manga reading order
        reading_order = sorted(group, key=lambda member: (-member['x1'], member['y1']))
        joined = "".join(member['text'] for member in reading_order)

        merged.append({
            'text': joined.replace(" ", ""),
            'x1': left,
            'y1': top,
            'x2': right,
            'y2': bottom
        })

    return merged
|