Spaces:
Running
on
Zero
Running
on
Zero
| """ | |
| Visualization utilities for drawing text detection boxes on images | |
| """ | |
| import numpy as np | |
| from PIL import Image, ImageDraw, ImageFont | |
| from typing import List, Dict, Tuple | |
| import os | |
| import math | |
def generate_random_color() -> Tuple[int, int, int]:
    """
    Pick a random RGB color for a bounding box.

    Red and green are drawn from [0, 200) and blue from [0, 255), using
    NumPy's global random state (``np.random.randint`` excludes the upper
    bound).

    Returns:
        (red, green, blue) tuple
    """
    red = np.random.randint(0, 200)
    green = np.random.randint(0, 200)
    blue = np.random.randint(0, 255)
    return (red, green, blue)
def _load_label_font(font_size: int):
    """Return the first loadable CJK-capable font at *font_size*, else Pillow's default font."""
    here = os.path.dirname(__file__)
    candidate_paths = [
        # Local fonts (project/fonts/) come first for portability; light weights preferred
        os.path.join(here, "fonts", "NotoSansCJK-Light.ttc"),
        os.path.join(here, "fonts", "NotoSansCJK-Regular.ttc"),
        os.path.join(here, "fonts", "STHeiti-Light.ttc"),
        # macOS fonts
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/Hiragino Sans GB.ttc",
        # Linux fonts
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Light.ttc",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
    ]
    for path in candidate_paths:
        try:
            return ImageFont.truetype(path, font_size)
        except (IOError, OSError):
            continue
    return ImageFont.load_default()


def _line_height(draw, font):
    """Return the unscaled height of one text line rendered with *font*."""
    try:
        ascent, descent = font.getmetrics()
    except AttributeError:
        # The fallback font object may not expose getmetrics(); measure a
        # sample string instead so labels can still be laid out.
        sample = draw.textbbox((0, 0), "Ag", font=font)
        return sample[3] - sample[1]
    return ascent + descent


def _wrap_text(draw, text, font, max_width):
    """Wrap *text* character by character to *max_width*; return (lines, widest_line_px)."""
    lines = []
    for raw_line in text.split('\n'):
        current = ""
        for char in raw_line:
            candidate = current + char
            bbox = draw.textbbox((0, 0), candidate, font=font)
            if bbox[2] - bbox[0] < max_width:
                current = candidate
            else:
                # Line is full: flush it and start the next line with this
                # character (a lone too-wide character still gets its own line).
                if current:
                    lines.append(current)
                current = char
        if current:
            lines.append(current)

    widest = 0
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        widest = max(widest, bbox[2] - bbox[0])
    return lines, widest


def _draw_label_box(draw, detection, font, show_text):
    """Fill one detection's box and, if *show_text* is set, draw its wrapped label inside."""
    x1, y1 = detection['x1'], detection['y1']
    x2, y2 = detection['x2'], detection['y2']
    box_w = x2 - x1
    box_h = y2 - y1

    lines = []
    total_h = 0
    line_pitch = 0
    if show_text:
        text = detection['text']
        # The box may grow by at most 20% in each direction to fit the label.
        max_allowed_w = int(box_w * 1.2)
        max_allowed_h = int(box_h * 1.2)
        # Lay out at the widest allowed width to minimize height;
        # 8 px is reserved for horizontal padding (4 px left, 4 px right).
        lines, max_line_w = _wrap_text(draw, text, font, max_allowed_w - 8)
        if lines:
            base_height = _line_height(draw, font)
            total_h = len(lines) * base_height * 1.2  # 1.2 line spacing
            line_pitch = base_height * 1.2
        # Never shrink below the detected size, never exceed the 20% cap.
        box_w = max(box_w, min(max_line_w + 8, max_allowed_w))
        box_h = max(box_h, min(total_h + 4, max_allowed_h))
        x2 = x1 + box_w
        y2 = y1 + box_h

    # Soft FloralWhite background, no border.
    draw.rectangle([x1, y1, x2, y2], fill=(255, 250, 240), outline=None)

    # Text is left-aligned with 4 px padding and centered vertically.
    start_y = y1 + (box_h - total_h) / 2
    text_color = (150, 0, 0)  # dark red
    for j, line in enumerate(lines):
        draw.text((x1 + 4, start_y + j * line_pitch), line, font=font, fill=text_color)


def draw_detection_boxes(
    image: Image.Image,
    detections: List[Dict],
    box_width: int = 2,
    font_size: int = 12,
    show_text: bool = True,
    merge_boxes: bool = True
) -> Image.Image:
    """
    Draw filled text boxes (and their labels) for each detection on a copy of the image.

    Each detection's rectangle is filled with a light background. When
    ``show_text`` is true, the detection text is wrapped character by
    character and drawn inside; the box may grow by up to 20% in width and
    height to fit it. A detection that cannot be drawn (e.g. missing key) is
    reported with ``print`` and skipped, so one bad entry does not abort the
    rest.

    Args:
        image: PIL Image to draw on (not modified; a copy is used)
        detections: List of detection dicts with 'text', 'x1', 'y1', 'x2', 'y2'
        box_width: Width of bounding box lines. Currently unused because boxes
            are drawn filled with no outline; kept for interface compatibility
        font_size: Font size for text labels
        show_text: Whether to show text labels; when False only the filled
            boxes are drawn, at their original size
        merge_boxes: Whether to merge close boxes via merge_detections (default: True)
    Returns:
        New RGB image with boxes and labels drawn
    """
    if merge_boxes:
        detections = merge_detections(detections)

    canvas = image.copy().convert('RGBA')
    draw = ImageDraw.Draw(canvas)

    # Load the label font once; it is the same for every detection.
    font = _load_label_font(font_size)

    for detection in detections:
        try:
            _draw_label_box(draw, detection, font, show_text)
        except Exception as e:
            # Best effort: report the failure and keep drawing the other boxes.
            print(f"Error drawing detection box: {str(e)}")
            continue

    return canvas.convert('RGB')
def create_side_by_side_comparison(
    original: Image.Image,
    annotated: Image.Image,
    spacing: int = 20
) -> Image.Image:
    """
    Place the original and annotated images next to each other on a white canvas.

    The original goes on the left and the annotated image on the right,
    separated by ``spacing`` pixels. Each image gets a caption ("Original",
    "Detected Text") near its top-left corner.

    Args:
        original: Original image
        annotated: Annotated image with boxes
        spacing: Space between images in pixels
    Returns:
        Combined RGB image showing both versions
    """
    left_w, left_h = original.size
    right_w, right_h = annotated.size
    right_x = left_w + spacing

    # White canvas wide enough for both images plus the gap, as tall as the taller one
    canvas_size = (left_w + right_w + spacing, max(left_h, right_h))
    canvas = Image.new('RGB', canvas_size, (255, 255, 255))
    canvas.paste(original, (0, 0))
    canvas.paste(annotated, (right_x, 0))

    # Use the first font that loads; otherwise fall back to Pillow's default
    for candidate in (
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/Hiragino Sans GB.ttc",
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    ):
        try:
            caption_font = ImageFont.truetype(candidate, 24)
        except (IOError, OSError):
            continue
        break
    else:
        caption_font = ImageFont.load_default()

    pen = ImageDraw.Draw(canvas)
    captions = (
        ((10, 10), "Original"),
        ((right_x + 10, 10), "Detected Text"),
    )
    for position, caption in captions:
        pen.text(position, caption, font=caption_font, fill=(0, 0, 0))

    return canvas
def get_detection_summary(detections: List[Dict]) -> str:
    """
    Build a human-readable, numbered report of the detections.

    Each entry shows the detected text and its box corners. When an entry
    carries an 'original_text' that differs from 'text', both are shown as
    "Original" and "Translated".

    Args:
        detections: List of detection dictionaries with 'text', 'x1', 'y1',
            'x2', 'y2' and optionally 'original_text'
    Returns:
        Formatted summary string, or a fixed message when the list is empty
    """
    if not detections:
        return "No text detected in the image."

    # Collect the pieces and join once at the end
    pieces = [f"Detected {len(detections)} text region(s):\n\n"]
    for index, det in enumerate(detections, 1):
        was_translated = 'original_text' in det and det['original_text'] != det['text']
        if was_translated:
            pieces.append(f"{index}. Original: \"{det['original_text']}\"\n")
            pieces.append(f" Translated: \"{det['text']}\"\n")
        else:
            pieces.append(f"{index}. \"{det['text']}\"\n")
        pieces.append(f" Location: ({det['x1']}, {det['y1']}) → ({det['x2']}, {det['y2']})\n\n")

    return "".join(pieces)
def merge_detections(detections: List[Dict], threshold: int = 30) -> List[Dict]:
    """
    Group nearby detection boxes and collapse each group into one box.

    Two boxes are linked when one, grown by ``threshold`` on every side,
    touches or overlaps the other. Links are transitive: every connected
    group of boxes becomes one result whose bounds enclose the whole group.
    The group's texts are concatenated right-to-left, then top-to-bottom
    (manga reading order), and all space characters are removed from the
    merged text. Only 'text', 'x1', 'y1', 'x2', 'y2' are kept in the output.

    Args:
        detections: List of detection dicts with 'text', 'x1', 'y1', 'x2', 'y2'
        threshold: Distance threshold for merging
    Returns:
        List of merged detection dicts (empty list for empty input)
    """
    if not detections:
        return []

    def is_near(first, second, margin):
        # Grow the first box by the margin on all four sides
        left, top = first['x1'] - margin, first['y1'] - margin
        right, bottom = first['x2'] + margin, first['y2'] + margin
        # The boxes are apart only if the second lies fully beyond one edge
        apart = (right < second['x1'] or left > second['x2'] or
                 bottom < second['y1'] or top > second['y2'])
        return not apart

    # Undirected proximity graph over detection indices
    count = len(detections)
    neighbours = [[] for _ in range(count)]
    for a, first in enumerate(detections):
        for b in range(a + 1, count):
            if is_near(first, detections[b], threshold):
                neighbours[a].append(b)
                neighbours[b].append(a)

    seen = [False] * count
    merged = []
    for start in range(count):
        if seen[start]:
            continue

        # Depth-first walk (LIFO stack) collecting everything reachable from start
        seen[start] = True
        pending = [start]
        group = []
        while pending:
            node = pending.pop()
            group.append(detections[node])
            for other in neighbours[node]:
                if not seen[other]:
                    seen[other] = True
                    pending.append(other)

        # Bounding box enclosing the whole group
        left = min(d['x1'] for d in group)
        top = min(d['y1'] for d in group)
        right = max(d['x2'] for d in group)
        bottom = max(d['y2'] for d in group)

        # Reading order: right-to-left (descending x1), then top-to-bottom (ascending y1)
        reading_order = sorted(group, key=lambda d: (-d['x1'], d['y1']))
        joined = "".join(d['text'] for d in reading_order).replace(" ", "")

        merged.append({
            'text': joined,
            'x1': left,
            'y1': top,
            'x2': right,
            'y2': bottom
        })

    return merged