# Hugging Face Space — runs on the ZeroGPU runtime.
"""
Gradio Web UI for HunyuanOCR Text Spotting
Upload an image and get text detection with bounding boxes
"""
import gradio as gr
from PIL import Image
import os

# Set environment variable to avoid tokenizer parallelism deadlocks
# (must be set before any tokenizer is instantiated by the model imports below)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import spaces
from ocr_model import HunyuanOCR
from visualization import draw_detection_boxes, get_detection_summary
from dotenv import load_dotenv
from openai import OpenAI
from huggingface_hub import hf_hub_download

# Load environment variables from a local .env file (API key / URL / model name)
load_dotenv()

# Global model instance, lazily created once by initialize_model()
ocr_model = None
def download_font():
    """Download the CJK font from the Hugging Face Hub if not already present.

    The font is stored next to this file under ``fonts/`` and is used by the
    visualization module to render translated text.

    Returns:
        str: Path where the font is expected; the file may be absent if the
        download failed (failure is logged but deliberately non-fatal).
    """
    font_dir = os.path.join(os.path.dirname(__file__), "fonts")
    os.makedirs(font_dir, exist_ok=True)
    font_path = os.path.join(font_dir, "NotoSansCJK-Light.ttc")
    if not os.path.exists(font_path):
        print("Downloading font from Hugging Face Hub...")
        try:
            # NOTE: the deprecated `local_dir_use_symlinks` kwarg was removed;
            # huggingface_hub ignores it when `local_dir` is given.
            hf_hub_download(
                repo_id="jzhang533/fonts",
                filename="NotoSansCJK-Light.ttc",
                repo_type="dataset",
                local_dir=font_dir,
            )
            print("Font downloaded successfully!")
        except Exception as e:
            # Best-effort: drawing falls back to a default font if this fails.
            print(f"Failed to download font: {e}")
    return font_path
def initialize_model():
    """Return the process-wide HunyuanOCR instance, creating it on first use."""
    global ocr_model
    if ocr_model is not None:
        return ocr_model
    # First call: make sure the rendering font exists, then load the model.
    download_font()
    print("Initializing HunyuanOCR model...")
    ocr_model = HunyuanOCR()
    print("Model ready!")
    return ocr_model
def process_image(image: Image.Image, prompt: str | None = None, target_language: str = "Chinese"):
    """
    Process an uploaded image: detect text, optionally translate it, and
    return an annotated copy.

    Args:
        image: PIL Image from Gradio; may be None when nothing was uploaded.
        prompt: Optional custom OCR prompt; a blank/None value falls back to
            the default Chinese detection prompt.
        target_language: Target language for translation; "Original" skips
            translation entirely.

    Returns:
        Tuple of (annotated_image, detection_summary, raw_response). On error
        the image slot is None and the summary carries the error message.
    """
    if image is None:
        return None, "Please upload an image first.", ""

    try:
        # Lazily initialize the shared model on first request
        model = initialize_model()

        # Resize if height > 960 while maintaining aspect ratio, to keep the
        # OCR input within a manageable resolution.
        if image.height > 960:
            aspect_ratio = image.width / image.height
            new_height = 960
            new_width = int(new_height * aspect_ratio)
            print(f"Resizing image from {image.size} to ({new_width}, {new_height})")
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        image_width, image_height = image.size

        # Fall back to the default detection prompt when none was supplied
        if not prompt or prompt.strip() == "":
            prompt = "检测并识别图片中的文字,将文本内容与坐标格式化输出。"

        print("Running text detection...")
        response = model.detect_text(image, prompt)
        detections = model.parse_detection_results(response, image_width, image_height)

        # Merge detections here (rather than letting draw_detection_boxes do it
        # internally) so translation operates on whole merged text regions.
        # Imported locally to avoid import-order issues at module load time.
        from visualization import merge_detections
        merged_detections = merge_detections(detections)

        # Translate each merged region unless "Original" was requested
        if target_language != "Original":
            print(f"Translating text to {target_language}...")
            for det in merged_detections:
                original_text = det['text']
                translated = translate_text(original_text, target_language)
                det['original_text'] = original_text
                det['text'] = translated
                print(f"Translated: {original_text[:20]}... -> {translated[:20]}...")
        else:
            print("Skipping translation (Original selected)")

        # Draw boxes on image (merging already done above, so disable it here)
        annotated_image = draw_detection_boxes(image, merged_detections, merge_boxes=False)

        summary = get_detection_summary(merged_detections)
        print(f"Detected {len(detections)} text regions")
        return annotated_image, summary, response

    except Exception as e:
        # Top-level UI boundary: surface the error in the summary box instead
        # of crashing the Gradio worker.
        error_msg = f"Error processing image: {str(e)}"
        print(error_msg)
        return None, error_msg, ""
def translate_text(text: str, target_language: str = "Chinese") -> str:
    """
    Translate *text* into *target_language* via an OpenAI-compatible API.

    The endpoint, key, and model name come from .env (MODEL_API_URL,
    MODEL_ACCESS_TOKEN, MODEL_NAME). Returns the original text unchanged when
    configuration is missing, the request fails, or the model returns an
    empty/None completion — translation is strictly best-effort.
    """
    try:
        api_key = os.getenv("MODEL_ACCESS_TOKEN")
        base_url = os.getenv("MODEL_API_URL")
        model_name = os.getenv("MODEL_NAME", "ernie-4.5-turbo-128k")  # Default fallback

        if not api_key or not base_url:
            print("Warning: MODEL_ACCESS_TOKEN or MODEL_API_URL not found in .env")
            return text

        client = OpenAI(api_key=api_key, base_url=base_url)
        system_prompt = f"You are a professional manga translator. The following text is from a Japanese manga. Translate it into natural and expressive {target_language}, maintaining the character's tone and the context of the scene. Only output the translation, no explanations."
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ]
        )
        content = response.choices[0].message.content
        # The API may return None content; guard explicitly instead of letting
        # `.strip()` raise AttributeError into the broad except below.
        if not content:
            return text
        return content.strip()
    except Exception as e:
        print(f"Translation error: {e}")
        return text
def create_demo():
    """Create and configure the Gradio Blocks interface.

    Layout: a two-column row (input image + prompt + language selector on the
    left, annotated result + summary + raw model output on the right), wired
    to process_image, followed by clickable examples and an about section.

    Returns:
        gr.Blocks: the assembled, unlaunched demo.
    """
    with gr.Blocks(title="AI Manga Translator") as demo:
        # Header / feature overview
        gr.Markdown("""
        # 📚 AI Manga Translator
        An intelligent tool designed to detect, recognize, and translate text in images, with specialized features for Manga and Comics.
        **Key Capabilities:**
        - 🖌️ **Smart Text Replacement**: Automatically detects text bubbles, wipes them clean, and overlays translated text.
        - 📖 **Manga-Optimized**: Handles vertical text and right-to-left reading order correctly.
        - 🌏 **Multi-Language Translation**: Translates detected text into your preferred language (Chinese, English, French, etc.).
        - 🔍 **High-Precision OCR**: Accurately spots text even in complex backgrounds.
        """)
        with gr.Row():
            with gr.Column(scale=1):
                # Input section
                gr.Markdown("### 📤 Input")
                input_image = gr.Image(
                    type="pil",
                    label="Upload Image",
                    sources=["upload", "clipboard"]
                )
                # Placeholder shows the default OCR prompt used when left empty
                custom_prompt = gr.Textbox(
                    label="Custom Prompt (Optional)",
                    placeholder="检测并识别图片中的文字,将文本内容与坐标格式化输出。",
                    lines=2
                )
                # "Original" disables translation in process_image
                target_lang = gr.Dropdown(
                    choices=["Original", "Chinese", "English", "French", "German", "Spanish", "Korean", "Japanese"],
                    value="Chinese",
                    label="Target Language",
                    info="Select language for translation (Original = no translation)"
                )
                detect_btn = gr.Button("🔍 Detect & Translate", variant="primary", size="lg")
            with gr.Column(scale=1):
                # Output section
                gr.Markdown("### 📊 Results")
                output_image = gr.Image(
                    type="pil",
                    label="Detected Text with Bounding Boxes"
                )
                detection_summary = gr.Textbox(
                    label="Detection Summary",
                    lines=10,
                    max_lines=20
                )
                # Raw model response, collapsed by default for debugging
                with gr.Accordion("Raw Model Response", open=False):
                    raw_output = gr.Textbox(label="Raw Output", lines=5)
        # Connect the button to the processing pipeline
        detect_btn.click(
            fn=process_image,
            inputs=[input_image, custom_prompt, target_lang],
            outputs=[output_image, detection_summary, raw_output]
        )
        # Examples (image + prompt pairs; language keeps its current value)
        gr.Markdown("### 📝 Examples")
        gr.Examples(
            examples=[
                ["examples/dandadan.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
                ["examples/ruridragon.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
                ["examples/spyfamily.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
            ],
            inputs=[input_image, custom_prompt],
            label="Click to use example image"
        )
        gr.Markdown("""
        ---
        ### ℹ️ About
        This application combines state-of-the-art AI technologies to provide seamless manga translation:
        - **OCR Engine**: HunyuanOCR.
        - **Translation**: Powered by **ERNIE 4.5** for natural and context-aware translations.
        - **Development**: Vibe coded with **Gemini 3 Pro**.
        """)
    return demo
if __name__ == "__main__":
    # Build the UI; the model itself loads lazily on the first request.
    print("Loading model (this may take a minute on first run)...")
    app = create_demo()
    # Bind on all interfaces; flip share to True for a public Gradio link.
    launch_options = {
        "server_name": "0.0.0.0",
        "share": False,
        "show_error": True,
        "ssr_mode": False,
    }
    app.launch(**launch_options)