File size: 12,351 Bytes
5b3defa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81bd1a1
5b3defa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79927f3
 
 
5b3defa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79927f3
5b3defa
 
79927f3
 
5b3defa
 
 
 
 
 
 
 
 
 
 
 
 
 
79927f3
4700e59
79927f3
5b3defa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81bd1a1
5b3defa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
"""
Visualization utilities for drawing text detection boxes on images
"""
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from typing import List, Dict, Tuple
import os
import math

def generate_random_color() -> Tuple[int, int, int]:
    """
    Pick a random RGB color for a bounding box.

    The red and green channels are drawn from [0, 200) and the blue
    channel from [0, 255) using NumPy's global random state.

    Returns:
        (red, green, blue) tuple
    """
    # Draw order matters for reproducibility under a fixed seed: R, G, then B.
    red = np.random.randint(0, 200)
    green = np.random.randint(0, 200)
    blue = np.random.randint(0, 255)
    return (red, green, blue)


def _load_label_font(font_size: int):
    """Return the first loadable CJK-capable font at ``font_size``.

    Project-local fonts (``<module dir>/fonts``) are tried first for
    portability, then well-known macOS and Linux system locations. If none
    can be opened, Pillow's built-in default font is returned instead.
    """
    local_fonts = os.path.join(os.path.dirname(__file__), "fonts")
    font_paths = [
        # Local fonts (project/fonts/) - prioritize slim/light fonts
        os.path.join(local_fonts, "NotoSansCJK-Light.ttc"),
        os.path.join(local_fonts, "NotoSansCJK-Regular.ttc"),
        os.path.join(local_fonts, "STHeiti-Light.ttc"),
        # macOS fonts
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/Hiragino Sans GB.ttc",
        # Linux fonts
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Light.ttc",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
    ]
    for path in font_paths:
        try:
            return ImageFont.truetype(path, font_size)
        except (IOError, OSError):
            continue
    return ImageFont.load_default()


def _wrap_text_to_width(draw, text, font, max_width):
    """Wrap ``text`` character by character so each line is narrower than ``max_width``.

    Explicit newlines in ``text`` always start a new line. A single character
    that is itself wider than ``max_width`` is still emitted on its own line.

    Returns:
        (lines, total_height, max_line_width); ``([], 0, 0)`` when there is
        nothing to draw. ``total_height`` uses 1.2x line spacing.
    """
    lines = []
    for raw_line in text.split('\n'):
        current_line = ""
        for char in raw_line:
            candidate = current_line + char
            bbox = draw.textbbox((0, 0), candidate, font=font)
            if bbox[2] - bbox[0] < max_width:
                current_line = candidate
            else:
                if current_line:
                    lines.append(current_line)
                current_line = char
        if current_line:
            lines.append(current_line)

    if not lines:
        return [], 0, 0

    # Line height comes from the font metrics, not from the rendered glyphs,
    # so every line gets the same vertical step.
    ascent, descent = font.getmetrics()
    total_height = len(lines) * (ascent + descent) * 1.2  # 1.2 line spacing

    max_line_w = 0
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        max_line_w = max(max_line_w, bbox[2] - bbox[0])

    return lines, total_height, max_line_w


def draw_detection_boxes(
    image: Image.Image,
    detections: List[Dict],
    box_width: int = 2,
    font_size: int = 12,
    show_text: bool = True,
    merge_boxes: bool = True
) -> Image.Image:
    """
    Draw filled text boxes with wrapped labels on a copy of the image

    Each detection's region is covered with a soft opaque background and its
    text is drawn on top, left-aligned and vertically centered. The box may
    grow by up to 20% in each dimension (anchored at its top-left corner) to
    fit the wrapped text; it never shrinks below the detection's own size.

    Args:
        image: PIL Image to draw on (not modified; a copy is used)
        detections: List of detection dicts with 'text', 'x1', 'y1', 'x2', 'y2'
        box_width: Accepted for backward compatibility; boxes are drawn
            without an outline, so this value is currently not used
        font_size: Font size for text labels
        show_text: Accepted for backward compatibility; currently not used,
            text is always drawn
        merge_boxes: Whether to merge close boxes (default: True)

    Returns:
        New RGB image with boxes and labels drawn

    Note:
        A detection that cannot be drawn (e.g. a missing key) is reported
        with ``print`` and skipped; the remaining detections are still drawn.
    """
    # Horizontal padding is split 4px left / 4px right; vertical padding is 4px total.
    pad_x_total = 8
    pad_left = 4
    pad_y_total = 4

    # Merge detections if requested
    if merge_boxes:
        detections = merge_detections(detections)

    # Work on an RGBA copy so the caller's image is left untouched
    img_draw = image.copy().convert('RGBA')
    draw = ImageDraw.Draw(img_draw)

    # Load the label font once; it is the same for every detection
    font = _load_label_font(font_size)

    for detection in detections:
        try:
            text = detection['text']
            x1, y1 = detection['x1'], detection['y1']
            x2, y2 = detection['x2'], detection['y2']

            box_w = x2 - x1
            box_h = y2 - y1

            # Calculate max allowed dimensions (max 20% larger)
            max_allowed_w = int(box_w * 1.2)
            max_allowed_h = int(box_h * 1.2)

            # Lay out at the maximum allowed width to minimize the height
            lines, total_h, max_line_w = _wrap_text_to_width(
                draw, text, font, max_allowed_w - pad_x_total
            )

            # Grow to fit the text, capped at 20% expansion, never shrinking
            # below the original size
            box_w = max(box_w, min(max_line_w + pad_x_total, max_allowed_w))
            box_h = max(box_h, min(total_h + pad_y_total, max_allowed_h))
            x2 = x1 + box_w
            y2 = y1 + box_h

            # Draw box with soft background (no border)
            draw.rectangle(
                [x1, y1, x2, y2],
                fill=(255, 250, 240),  # FloralWhite (soft background)
                outline=None
            )

            # Draw text left-aligned horizontally and centered vertically
            ascent, descent = font.getmetrics()
            line_height = (ascent + descent) * 1.2
            start_y = y1 + (box_h - total_h) / 2
            text_color = (150, 0, 0)  # dark red
            line_x = x1 + pad_left

            for j, line in enumerate(lines):
                line_y = start_y + j * line_height
                draw.text((line_x, line_y), line, font=font, fill=text_color)

        except Exception as e:
            # Best-effort rendering: one bad detection must not abort the rest
            print(f"Error drawing detection box: {str(e)}")
            continue

    # Convert back to RGB
    return img_draw.convert('RGB')


def create_side_by_side_comparison(
    original: Image.Image,
    annotated: Image.Image,
    spacing: int = 20
) -> Image.Image:
    """
    Place the original and annotated images next to each other on a white canvas

    The original goes on the left at (0, 0); the annotated image goes to its
    right after a gap of ``spacing`` pixels. Both are top-aligned and the
    canvas is as tall as the taller of the two. The captions "Original" and
    "Detected Text" are drawn near the top-left corner of each image.

    Args:
        original: Original image
        annotated: Annotated image with boxes
        spacing: Space between images in pixels

    Returns:
        Combined RGB image showing both versions
    """
    left_w, left_h = original.size
    right_w, right_h = annotated.size

    # X offset at which the annotated image (and its caption) starts
    right_x = left_w + spacing

    canvas = Image.new(
        'RGB',
        (right_x + right_w, max(left_h, right_h)),
        (255, 255, 255)
    )
    canvas.paste(original, (0, 0))
    canvas.paste(annotated, (right_x, 0))

    # Caption font: first candidate that opens wins, otherwise Pillow's default
    candidates = (
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/Hiragino Sans GB.ttc",
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    for candidate in candidates:
        try:
            caption_font = ImageFont.truetype(candidate, 24)
        except (IOError, OSError):
            continue
        break
    else:
        caption_font = ImageFont.load_default()

    black = (0, 0, 0)
    painter = ImageDraw.Draw(canvas)
    painter.text((10, 10), "Original", font=caption_font, fill=black)
    painter.text((right_x + 10, 10), "Detected Text", font=caption_font, fill=black)

    return canvas


def get_detection_summary(detections: List[Dict]) -> str:
    """
    Render detection results as a numbered, human-readable listing

    Each entry shows the detected text and its box corners. When an entry
    carries an 'original_text' that differs from 'text', both are shown as
    an Original/Translated pair.

    Args:
        detections: List of detection dictionaries

    Returns:
        Formatted summary string
    """
    if not detections:
        return "No text detected in the image."

    pieces = [f"Detected {len(detections)} text region(s):\n\n"]

    for number, entry in enumerate(detections, 1):
        was_translated = (
            'original_text' in entry
            and entry['original_text'] != entry['text']
        )
        if was_translated:
            pieces.append(f'{number}. Original: "{entry["original_text"]}"\n')
            pieces.append(f'   Translated: "{entry["text"]}"\n')
        else:
            pieces.append(f'{number}. "{entry["text"]}"\n')
        pieces.append(
            f"   Location: ({entry['x1']}, {entry['y1']}) → ({entry['x2']}, {entry['y2']})\n\n"
        )

    return "".join(pieces)


def merge_detections(detections: List[Dict], threshold: int = 30) -> List[Dict]:
    """
    Group nearby detection boxes and collapse each group into one box

    Two boxes are linked when the first, grown by ``threshold`` on every
    side, touches or overlaps the second. Groups are the connected components
    of those links, so boxes can end up merged through a chain of neighbors.

    Args:
        detections: List of detection dicts
        threshold: Distance threshold for merging

    Returns:
        List of merged detection dicts, each holding only 'text', 'x1', 'y1',
        'x2', 'y2'. The merged text concatenates the group's texts in
        right-to-left, then top-to-bottom order with all spaces removed.
    """
    if not detections:
        return []

    def too_far_apart(first, second, margin):
        # True when `first`, grown by `margin`, lies entirely on one side of `second`
        return (
            first['x2'] + margin < second['x1'] or
            first['x1'] - margin > second['x2'] or
            first['y2'] + margin < second['y1'] or
            first['y1'] - margin > second['y2']
        )

    count = len(detections)

    # Undirected neighbor lists; each unordered pair is tested exactly once
    neighbors = {index: [] for index in range(count)}
    for left in range(count):
        for right in range(left + 1, count):
            if not too_far_apart(detections[left], detections[right], threshold):
                neighbors[left].append(right)
                neighbors[right].append(left)

    seen = set()
    merged = []

    for seed in range(count):
        if seed in seen:
            continue

        # Stack-based traversal collecting every box reachable from `seed`
        seen.add(seed)
        pending = [seed]
        group = []
        while pending:
            current = pending.pop()
            group.append(detections[current])
            for other in neighbors[current]:
                if other not in seen:
                    seen.add(other)
                    pending.append(other)

        # Reading order: right-to-left (descending x1), then top-to-bottom
        # (ascending y1) - the usual order for manga
        ordered = sorted(group, key=lambda d: (-d['x1'], d['y1']))
        joined_text = "".join(d['text'] for d in ordered).replace(" ", "")

        merged.append({
            'text': joined_text,
            'x1': min(d['x1'] for d in group),
            'y1': min(d['y1'] for d in group),
            'x2': max(d['x2'] for d in group),
            'y2': max(d['y2'] for d in group)
        })

    return merged