Spaces:

jzhang533
/

ai_manga_translator

Running on Zero

App Files Files Community

ai_manga_translator / visualization.py

jzhang533

minor

4700e59 18 days ago

raw

history blame contribute delete

12.4 kB

	"""
	Visualization utilities for drawing text detection boxes on images
	"""
	import numpy as np
	from PIL import Image, ImageDraw, ImageFont
	from typing import List, Dict, Tuple
	import os
	import math

	def generate_random_color() -> Tuple[int, int, int]:
	    """
	    Generate a random color for bounding boxes

	    Red and green are drawn from [0, 200) and blue from [0, 255); the upper
	    bound passed to ``np.random.randint`` is exclusive.

	    Returns:
	        RGB color tuple of built-in ``int`` values
	    """
	    channel_limits = (200, 200, 255)
	    # np.random.randint returns a NumPy integer scalar; int() makes the result
	    # match the annotated return type. Channels are drawn in R, G, B order.
	    return tuple(int(np.random.randint(0, limit)) for limit in channel_limits)


	def _load_label_font(font_size: int):
	    """Return the first loadable CJK-capable font at ``font_size``, else Pillow's default font."""
	    local_fonts = os.path.join(os.path.dirname(__file__), "fonts")
	    candidates = [
	        # Local fonts (project/fonts/) come first for portability; light weights preferred
	        os.path.join(local_fonts, "NotoSansCJK-Light.ttc"),
	        os.path.join(local_fonts, "NotoSansCJK-Regular.ttc"),
	        os.path.join(local_fonts, "STHeiti-Light.ttc"),
	        # macOS fonts
	        "/System/Library/Fonts/STHeiti Light.ttc",
	        "/System/Library/Fonts/PingFang.ttc",
	        "/System/Library/Fonts/Hiragino Sans GB.ttc",
	        # Linux fonts
	        "/usr/share/fonts/truetype/noto/NotoSansCJK-Light.ttc",
	        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
	        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
	    ]
	    for path in candidates:
	        try:
	            return ImageFont.truetype(path, font_size)
	        except (IOError, OSError):
	            # Font file missing or unreadable: try the next candidate
	            continue
	    return ImageFont.load_default()


	def _wrap_text_to_width(draw, text: str, font, max_width) -> List[str]:
	    """Split ``text`` into lines, breaking per character so each rendered line stays narrower than ``max_width``."""
	    lines = []
	    for raw_line in text.split('\n'):
	        current = ""
	        for char in raw_line:
	            candidate = current + char
	            left, _, right, _ = draw.textbbox((0, 0), candidate, font=font)
	            if right - left < max_width:
	                current = candidate
	            else:
	                # A single character wider than max_width still gets its own line
	                if current:
	                    lines.append(current)
	                current = char
	        if current:
	            lines.append(current)
	    return lines


	def draw_detection_boxes(
	    image: Image.Image,
	    detections: List[Dict],
	    box_width: int = 2,
	    font_size: int = 12,
	    show_text: bool = True,
	    merge_boxes: bool = True
	) -> Image.Image:
	    """
	    Draw filled text boxes with wrapped labels on a copy of the image

	    Each detection region is painted with an opaque light background and its
	    text is drawn on top, left-aligned and vertically centered. A region may
	    grow by at most 20% in each dimension (anchored at its top-left corner)
	    to fit the wrapped text. A detection that fails to draw is reported with
	    ``print`` and skipped; the remaining detections are still drawn.

	    Args:
	        image: PIL Image to draw on (not modified)
	        detections: List of detection dicts with 'text', 'x1', 'y1', 'x2', 'y2'
	        box_width: Accepted for API compatibility; no outline is drawn, so it
	            currently has no effect
	        font_size: Font size for text labels
	        show_text: Accepted for API compatibility; text is currently always drawn
	        merge_boxes: Whether to merge close boxes (default: True)

	    Returns:
	        New RGB image with boxes and labels drawn
	    """
	    # Merge detections if requested
	    if merge_boxes:
	        detections = merge_detections(detections)

	    # Work on an RGBA copy so the caller's image is left untouched
	    img_draw = image.copy().convert('RGBA')
	    draw = ImageDraw.Draw(img_draw)

	    # Load the label font once; it is the same for every detection
	    font = _load_label_font(font_size)

	    pad_x = 4  # horizontal padding on each side of the text
	    pad_y = 4  # total extra height added around the text block
	    fill_color = (255, 250, 240)  # FloralWhite (soft background)
	    text_color = (150, 0, 0)  # dark red

	    for detection in detections:
	        try:
	            text = detection['text']
	            x1, y1 = detection['x1'], detection['y1']
	            box_w = detection['x2'] - x1
	            box_h = detection['y2'] - y1

	            # The box may expand by at most 20% in each dimension
	            max_allowed_w = int(box_w * 1.2)
	            max_allowed_h = int(box_h * 1.2)

	            # Wrap at the widest allowed width to keep the text block short
	            lines = _wrap_text_to_width(draw, text, font, max_allowed_w - 2 * pad_x)

	            ascent, descent = font.getmetrics()
	            total_h = len(lines) * (ascent + descent) * 1.2  # 1.2 line spacing
	            line_step = (ascent + descent) * 1.2
	            max_line_w = max(
	                (bbox[2] - bbox[0]
	                 for bbox in (draw.textbbox((0, 0), line, font=font) for line in lines)),
	                default=0,
	            )

	            # Never shrink below the detected size, never exceed the 20% cap
	            new_w = max(box_w, min(max_line_w + 2 * pad_x, max_allowed_w))
	            new_h = max(box_h, min(total_h + pad_y, max_allowed_h))

	            # Draw box with soft background (no border)
	            draw.rectangle(
	                [x1, y1, x1 + new_w, y1 + new_h],
	                fill=fill_color,
	                outline=None
	            )

	            # Draw text left-aligned horizontally and centered vertically
	            start_y = y1 + (new_h - total_h) / 2
	            for row, line in enumerate(lines):
	                draw.text(
	                    (x1 + pad_x, start_y + row * line_step),
	                    line,
	                    font=font,
	                    fill=text_color
	                )

	        except Exception as e:
	            # Best effort: one malformed detection must not abort the whole render
	            print(f"Error drawing detection box: {str(e)}")

	    # Convert back to RGB
	    return img_draw.convert('RGB')


	def create_side_by_side_comparison(
	    original: Image.Image,
	    annotated: Image.Image,
	    spacing: int = 20
	) -> Image.Image:
	    """
	    Place the original and annotated images next to each other on one canvas

	    The original goes on the left and the annotated image on the right,
	    separated by ``spacing`` pixels, on a white RGB canvas as tall as the
	    taller image. Each half is labelled in its top-left corner.

	    Args:
	        original: Original image
	        annotated: Annotated image with boxes
	        spacing: Space between images in pixels

	    Returns:
	        Combined image showing both versions
	    """
	    left_w, left_h = original.size
	    right_w, right_h = annotated.size
	    right_origin_x = left_w + spacing

	    # White canvas wide enough for both images plus the gap
	    canvas = Image.new(
	        'RGB',
	        (left_w + right_w + spacing, max(left_h, right_h)),
	        (255, 255, 255)
	    )
	    canvas.paste(original, (0, 0))
	    canvas.paste(annotated, (right_origin_x, 0))

	    pen = ImageDraw.Draw(canvas)

	    # Use the first candidate font that loads; otherwise Pillow's default
	    candidate_fonts = (
	        "/System/Library/Fonts/PingFang.ttc",
	        "/System/Library/Fonts/Hiragino Sans GB.ttc",
	        "/System/Library/Fonts/STHeiti Light.ttc",
	        "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
	        "/System/Library/Fonts/Supplemental/Arial.ttf",
	        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
	        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
	    )
	    for font_file in candidate_fonts:
	        try:
	            label_font = ImageFont.truetype(font_file, 24)
	        except (IOError, OSError):
	            continue
	        break
	    else:
	        label_font = ImageFont.load_default()

	    # Caption each half, 10px in from its own top-left corner
	    captions = (
	        (10, "Original"),
	        (right_origin_x + 10, "Detected Text"),
	    )
	    for caption_x, caption in captions:
	        pen.text((caption_x, 10), caption, font=label_font, fill=(0, 0, 0))

	    return canvas


	def get_detection_summary(detections: List[Dict]) -> str:
	    """
	    Create a text summary of detection results

	    Each entry is numbered from 1. When a detection carries an
	    'original_text' that differs from its 'text', both are listed
	    (as "Original" and "Translated"); otherwise only 'text' is shown.
	    Every entry ends with its box corners.

	    Args:
	        detections: List of detection dictionaries with 'text', 'x1', 'y1',
	            'x2', 'y2' and optionally 'original_text'

	    Returns:
	        Formatted summary string
	    """
	    if not detections:
	        return "No text detected in the image."

	    # Collect the pieces and join once instead of growing a string with +=
	    parts = [f"Detected {len(detections)} text region(s):\n\n"]

	    for i, det in enumerate(detections, 1):
	        if 'original_text' in det and det['original_text'] != det['text']:
	            parts.append(f"{i}. Original: \"{det['original_text']}\"\n")
	            parts.append(f" Translated: \"{det['text']}\"\n")
	        else:
	            parts.append(f"{i}. \"{det['text']}\"\n")
	        parts.append(
	            f" Location: ({det['x1']}, {det['y1']}) → ({det['x2']}, {det['y2']})\n\n"
	        )

	    return "".join(parts)


	def merge_detections(detections: List[Dict], threshold: int = 30) -> List[Dict]:
	    """
	    Merge close detection boxes into single boxes

	    Two boxes are linked when one, grown by ``threshold`` on every side,
	    touches or overlaps the other. Linking is transitive: every connected
	    group of boxes becomes one result whose bounds enclose the whole group.
	    The group's texts are concatenated right-to-left, then top-to-bottom,
	    and all space characters are removed from the result. Only the keys
	    'text', 'x1', 'y1', 'x2', 'y2' are carried into the output; the input
	    dicts are not modified.

	    Args:
	        detections: List of detection dicts with 'text', 'x1', 'y1', 'x2', 'y2'
	        threshold: Distance threshold for merging

	    Returns:
	        List of merged detection dicts, one per connected group
	    """
	    if not detections:
	        return []

	    def are_close(box1, box2, thresh):
	        # box1 grown by thresh on every side must touch or overlap box2
	        return (
	            box1['x2'] + thresh >= box2['x1']
	            and box1['x1'] - thresh <= box2['x2']
	            and box1['y2'] + thresh >= box2['y1']
	            and box1['y1'] - thresh <= box2['y2']
	        )

	    # Build adjacency list over every unordered pair of boxes
	    n = len(detections)
	    adj = [[] for _ in range(n)]
	    for i in range(n):
	        for j in range(i + 1, n):
	            if are_close(detections[i], detections[j], threshold):
	                adj[i].append(j)
	                adj[j].append(i)

	    # Find connected components
	    visited = [False] * n
	    merged_results = []

	    for start in range(n):
	        if visited[start]:
	            continue

	        # Depth-first traversal with an explicit stack; the component
	        # always contains at least the starting box
	        component = []
	        stack = [start]
	        visited[start] = True
	        while stack:
	            curr = stack.pop()
	            component.append(detections[curr])
	            for neighbor in adj[curr]:
	                if not visited[neighbor]:
	                    visited[neighbor] = True
	                    stack.append(neighbor)

	        # Sort texts: Right-to-Left (descending X), then Top-to-Bottom (ascending Y)
	        # This is standard for Manga reading order
	        ordered = sorted(component, key=lambda d: (-d['x1'], d['y1']))
	        merged_text = "".join(d['text'] for d in ordered).replace(" ", "")

	        merged_results.append({
	            'text': merged_text,
	            'x1': min(d['x1'] for d in component),
	            'y1': min(d['y1'] for d in component),
	            'x2': max(d['x2'] for d in component),
	            'y2': max(d['y2'] for d in component)
	        })

	    return merged_results