File size: 10,130 Bytes
5b3defa
 
 
 
 
 
 
79927f3
 
 
 
c77244c
5b3defa
 
 
 
6d3742e
5b3defa
 
 
 
 
 
 
 
6d3742e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b3defa
 
 
 
6d3742e
 
 
5b3defa
 
 
 
 
 
c77244c
5b3defa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d3742e
 
 
5b3defa
 
 
 
 
6d3742e
5b3defa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3e82eb
5b3defa
bcd299f
 
5b3defa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
"""
Gradio Web UI for HunyuanOCR Text Spotting
Upload an image and get text detection with bounding boxes
"""
import gradio as gr
from PIL import Image
import os

# Set environment variable to avoid tokenizer parallelism deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import spaces
from ocr_model import HunyuanOCR
from visualization import draw_detection_boxes, get_detection_summary, merge_detections
from dotenv import load_dotenv
from openai import OpenAI
from huggingface_hub import hf_hub_download

# Load environment variables
load_dotenv()

# Global model instance (loaded once)
ocr_model = None


def download_font():
    """Download the CJK font from the Hugging Face Hub if not already present.

    Ensures ``fonts/NotoSansCJK-Light.ttc`` exists next to this file so the
    visualization code can render CJK glyphs. Download failures are logged
    and swallowed on purpose: the app should still run (rendering falls back
    to a default font) rather than crash at startup.
    """
    font_dir = os.path.join(os.path.dirname(__file__), "fonts")
    os.makedirs(font_dir, exist_ok=True)

    font_path = os.path.join(font_dir, "NotoSansCJK-Light.ttc")
    if os.path.exists(font_path):
        # Already cached from a previous run — nothing to do.
        return

    print("Downloading font from Hugging Face Hub...")
    try:
        # NOTE: the deprecated `local_dir_use_symlinks` argument was dropped;
        # with `local_dir` set, modern huggingface_hub always writes a real
        # file into the target directory.
        hf_hub_download(
            repo_id="jzhang533/fonts",
            filename="NotoSansCJK-Light.ttc",
            repo_type="dataset",
            local_dir=font_dir,
        )
        print("Font downloaded successfully!")
    except Exception as e:
        print(f"Failed to download font: {e}")


def initialize_model():
    """Lazily construct the global HunyuanOCR instance and return it.

    The model is built exactly once; every later call returns the cached
    global instance without re-initializing anything.
    """
    global ocr_model
    if ocr_model is not None:
        return ocr_model

    # The font must be on disk before the model/visualization pipeline runs.
    download_font()

    print("Initializing HunyuanOCR model...")
    ocr_model = HunyuanOCR()
    print("Model ready!")
    return ocr_model


@spaces.GPU
def process_image(image: Image.Image, prompt: str = None, target_language: str = "Chinese"):
    """
    Process an uploaded image: detect text, optionally translate it, and
    draw annotated bounding boxes.

    Args:
        image: PIL Image from Gradio (None when nothing was uploaded)
        prompt: Optional custom OCR prompt; falls back to the default
            Chinese text-spotting prompt when empty/blank
        target_language: Target language for translation; "Original"
            skips translation entirely

    Returns:
        Tuple of (annotated_image, detection_summary, raw_response).
        On failure the image slot is None and the summary carries the
        error message, so the UI never raises.
    """
    if image is None:
        return None, "Please upload an image first.", ""

    try:
        # Initialize model if needed (lazy, cached globally)
        model = initialize_model()

        # Downscale tall images to height 960 (aspect ratio preserved) to
        # bound the OCR input size.
        if image.height > 960:
            aspect_ratio = image.width / image.height
            new_height = 960
            new_width = int(new_height * aspect_ratio)
            print(f"Resizing image from {image.size} to ({new_width}, {new_height})")
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        image_width, image_height = image.size

        # Fall back to the default text-spotting prompt when none provided.
        if not prompt or not prompt.strip():
            prompt = "检测并识别图片中的文字,将文本内容与坐标格式化输出。"

        print("Running text detection...")
        response = model.detect_text(image, prompt)

        detections = model.parse_detection_results(response, image_width, image_height)

        # Merge detections up front (visualization can also do this
        # internally, but translation must operate on the merged regions).
        merged_detections = merge_detections(detections)

        if target_language != "Original":
            print(f"Translating text to {target_language}...")
            for det in merged_detections:
                original_text = det['text']
                translated = translate_text(original_text, target_language)
                # Keep the source text alongside the translation.
                det['original_text'] = original_text
                det['text'] = translated
                print(f"Translated: {original_text[:20]}... -> {translated[:20]}...")
        else:
            print("Skipping translation (Original selected)")

        # Internal merging disabled: detections were already merged above.
        annotated_image = draw_detection_boxes(image, merged_detections, merge_boxes=False)

        summary = get_detection_summary(merged_detections)

        # Report both raw and merged counts — the summary reflects the
        # merged regions, so the raw count alone was misleading.
        print(f"Detected {len(detections)} text regions ({len(merged_detections)} after merging)")

        return annotated_image, summary, response

    except Exception as e:
        error_msg = f"Error processing image: {str(e)}"
        print(error_msg)
        return None, error_msg, ""


def translate_text(text: str, target_language: str = "Chinese") -> str:
    """Translate *text* into *target_language* via an OpenAI-compatible API.

    Endpoint, credentials and model name come from the environment
    (MODEL_API_URL, MODEL_ACCESS_TOKEN, MODEL_NAME). On any failure —
    missing configuration or an API error — the original text is returned
    unchanged so callers never break.
    """
    api_key = os.getenv("MODEL_ACCESS_TOKEN")
    base_url = os.getenv("MODEL_API_URL")
    model_name = os.getenv("MODEL_NAME", "ernie-4.5-turbo-128k")  # Default fallback

    if not api_key or not base_url:
        print("Warning: MODEL_ACCESS_TOKEN or MODEL_API_URL not found in .env")
        return text

    system_prompt = (
        "You are a professional manga translator. The following text is "
        f"from a Japanese manga. Translate it into natural and expressive {target_language}, "
        "maintaining the character's tone and the context of the scene. "
        "Only output the translation, no explanations."
    )

    try:
        client = OpenAI(api_key=api_key, base_url=base_url)
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Translation error: {e}")
        return text


def create_demo():
    """Build and return the Gradio Blocks UI.

    Layout: header markdown, a two-column row (input controls | results),
    a click handler wiring the button to ``process_image``, example images,
    and an about section. Nothing is launched here — the caller is
    responsible for ``demo.launch()``.
    """
    
    with gr.Blocks(title="AI Manga Translator") as demo:
        # Header / feature overview shown at the top of the page.
        gr.Markdown("""
        # 📚 AI Manga Translator
        
        An intelligent tool designed to detect, recognize, and translate text in images, with specialized features for Manga and Comics.
        
        **Key Capabilities:**
        - 🖌️ **Smart Text Replacement**: Automatically detects text bubbles, wipes them clean, and overlays translated text.
        - 📖 **Manga-Optimized**: Handles vertical text and right-to-left reading order correctly.
        - 🌏 **Multi-Language Translation**: Translates detected text into your preferred language (Chinese, English, French, etc.).
        - 🔍 **High-Precision OCR**: Accurately spots text even in complex backgrounds.
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                # Input section
                gr.Markdown("### 📤 Input")
                input_image = gr.Image(
                    type="pil",
                    label="Upload Image",
                    sources=["upload", "clipboard"]
                )
                
                # Free-form OCR prompt; the placeholder shows the default
                # that process_image uses when this is left empty.
                custom_prompt = gr.Textbox(
                    label="Custom Prompt (Optional)",
                    placeholder="检测并识别图片中的文字,将文本内容与坐标格式化输出。",
                    lines=2
                )
                
                # "Original" disables translation inside process_image.
                target_lang = gr.Dropdown(
                    choices=["Original", "Chinese", "English", "French", "German", "Spanish", "Korean", "Japanese"],
                    value="Chinese",
                    label="Target Language",
                    info="Select language for translation (Original = no translation)"
                )
                
                detect_btn = gr.Button("🔍 Detect & Translate", variant="primary", size="lg")
            
            with gr.Column(scale=1):
                # Output section
                gr.Markdown("### 📊 Results")
                output_image = gr.Image(
                    type="pil",
                    label="Detected Text with Bounding Boxes"
                )
                
                detection_summary = gr.Textbox(
                    label="Detection Summary",
                    lines=10,
                    max_lines=20
                )
                
                # Raw model output, collapsed by default — useful for debugging.
                with gr.Accordion("Raw Model Response", open=False):
                    raw_output = gr.Textbox(label="Raw Output", lines=5)
        
        # Connect the button to the OCR + translation pipeline.
        detect_btn.click(
            fn=process_image,
            inputs=[input_image, custom_prompt, target_lang],
            outputs=[output_image, detection_summary, raw_output]
        )
        
        # Examples only populate the image and prompt inputs; the target
        # language keeps whatever the dropdown currently shows.
        gr.Markdown("### 📝 Examples")
        gr.Examples(
            examples=[
                ["examples/dandadan.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
                ["examples/ruridragon.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
                ["examples/spyfamily.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
            ],
            inputs=[input_image, custom_prompt],
            label="Click to use example image"
        )
        
        gr.Markdown("""
        ---
        ### ℹ️ About
        
        This application combines state-of-the-art AI technologies to provide seamless manga translation:
        
        - **OCR Engine**: HunyuanOCR.
        - **Translation**: Powered by **ERNIE 4.5** for natural and context-aware translations.
        - **Development**: Vibe coded with **Gemini 3 Pro**.
        """)
    
    return demo


if __name__ == "__main__":
    # Build the UI and start the server. The OCR model itself loads lazily
    # on the first request, not here.
    print("Loading model (this may take a minute on first run)...")

    app = create_demo()

    # Bind to all interfaces; flip share=True to get a public gradio link.
    app.launch(
        server_name="0.0.0.0",
        share=False,
        show_error=True,
        ssr_mode=False,
    )