File size: 10,130 Bytes
5b3defa
 
 
 
 
 
 
79927f3
 
 
 
c77244c
5b3defa
 
 
 
6d3742e
5b3defa
 
 
 
 
 
 
 
6d3742e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b3defa
 
 
 
6d3742e
 
 
5b3defa
 
 
 
 
 
c77244c
5b3defa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d3742e
 
 
5b3defa
 
 
 
 
6d3742e
5b3defa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3e82eb
5b3defa
bcd299f
 
5b3defa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
"""
Gradio Web UI for HunyuanOCR Text Spotting
Upload an image and get text detection with bounding boxes
"""
import gradio as gr
from PIL import Image
import os

# Set environment variable to avoid tokenizer parallelism deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import spaces
from ocr_model import HunyuanOCR
from visualization import draw_detection_boxes, get_detection_summary, merge_detections
from dotenv import load_dotenv
from openai import OpenAI
from huggingface_hub import hf_hub_download

# Load environment variables
load_dotenv()

# Global model instance (loaded once)
ocr_model = None


def download_font():
    """Download the CJK font from the Hugging Face Hub if not already present.

    Ensures ``fonts/NotoSansCJK-Light.ttc`` exists next to this file so the
    visualization code can render CJK glyphs. Download failures are logged
    and swallowed on purpose: the app should still run (rendering falls back
    to a default font) rather than crash at startup.
    """
    font_dir = os.path.join(os.path.dirname(__file__), "fonts")
    os.makedirs(font_dir, exist_ok=True)

    font_path = os.path.join(font_dir, "NotoSansCJK-Light.ttc")
    if os.path.exists(font_path):
        # Already cached from a previous run — nothing to do.
        return

    print("Downloading font from Hugging Face Hub...")
    try:
        # NOTE: the deprecated `local_dir_use_symlinks` argument was dropped;
        # with `local_dir` set, modern huggingface_hub always writes a real
        # file into the target directory.
        hf_hub_download(
            repo_id="jzhang533/fonts",
            filename="NotoSansCJK-Light.ttc",
            repo_type="dataset",
            local_dir=font_dir,
        )
        print("Font downloaded successfully!")
    except Exception as e:
        print(f"Failed to download font: {e}")


def initialize_model():
    """Lazily construct the global HunyuanOCR instance and return it.

    The model is built exactly once; every later call returns the cached
    global instance without re-initializing anything.
    """
    global ocr_model
    if ocr_model is not None:
        return ocr_model

    # The font must be on disk before the model/visualization pipeline runs.
    download_font()

    print("Initializing HunyuanOCR model...")
    ocr_model = HunyuanOCR()
    print("Model ready!")
    return ocr_model


@spaces.GPU
def process_image(image: Image.Image, prompt: str = None, target_language: str = "Chinese"):
    """
    Process an uploaded image: detect text, optionally translate it, and
    draw annotated bounding boxes.

    Args:
        image: PIL Image from Gradio (None when nothing was uploaded)
        prompt: Optional custom OCR prompt; falls back to the default
            Chinese text-spotting prompt when empty/blank
        target_language: Target language for translation; "Original"
            skips translation entirely

    Returns:
        Tuple of (annotated_image, detection_summary, raw_response).
        On failure the image slot is None and the summary carries the
        error message, so the UI never raises.
    """
    if image is None:
        return None, "Please upload an image first.", ""

    try:
        # Initialize model if needed (lazy, cached globally)
        model = initialize_model()

        # Downscale tall images to height 960 (aspect ratio preserved) to
        # bound the OCR input size.
        if image.height > 960:
            aspect_ratio = image.width / image.height
            new_height = 960
            new_width = int(new_height * aspect_ratio)
            print(f"Resizing image from {image.size} to ({new_width}, {new_height})")
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        image_width, image_height = image.size

        # Fall back to the default text-spotting prompt when none provided.
        if not prompt or not prompt.strip():
            prompt = "检测并识别图片中的文字,将文本内容与坐标格式化输出。"

        print("Running text detection...")
        response = model.detect_text(image, prompt)

        detections = model.parse_detection_results(response, image_width, image_height)

        # Merge detections up front (visualization can also do this
        # internally, but translation must operate on the merged regions).
        merged_detections = merge_detections(detections)

        if target_language != "Original":
            print(f"Translating text to {target_language}...")
            for det in merged_detections:
                original_text = det['text']
                translated = translate_text(original_text, target_language)
                # Keep the source text alongside the translation.
                det['original_text'] = original_text
                det['text'] = translated
                print(f"Translated: {original_text[:20]}... -> {translated[:20]}...")
        else:
            print("Skipping translation (Original selected)")

        # Internal merging disabled: detections were already merged above.
        annotated_image = draw_detection_boxes(image, merged_detections, merge_boxes=False)

        summary = get_detection_summary(merged_detections)

        # Report both raw and merged counts — the summary reflects the
        # merged regions, so the raw count alone was misleading.
        print(f"Detected {len(detections)} text regions ({len(merged_detections)} after merging)")

        return annotated_image, summary, response

    except Exception as e:
        error_msg = f"Error processing image: {str(e)}"
        print(error_msg)
        return None, error_msg, ""


def translate_text(text: str, target_language: str = "Chinese") -> str:
    """Translate *text* into *target_language* via an OpenAI-compatible API.

    Endpoint, credentials and model name come from the environment
    (MODEL_API_URL, MODEL_ACCESS_TOKEN, MODEL_NAME). On any failure —
    missing configuration or an API error — the original text is returned
    unchanged so callers never break.
    """
    api_key = os.getenv("MODEL_ACCESS_TOKEN")
    base_url = os.getenv("MODEL_API_URL")
    model_name = os.getenv("MODEL_NAME", "ernie-4.5-turbo-128k")  # Default fallback

    if not api_key or not base_url:
        print("Warning: MODEL_ACCESS_TOKEN or MODEL_API_URL not found in .env")
        return text

    system_prompt = (
        "You are a professional manga translator. The following text is "
        f"from a Japanese manga. Translate it into natural and expressive {target_language}, "
        "maintaining the character's tone and the context of the scene. "
        "Only output the translation, no explanations."
    )

    try:
        client = OpenAI(api_key=api_key, base_url=base_url)
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Translation error: {e}")
        return text


def create_demo():
    """Build and return the Gradio Blocks UI.

    Layout: header markdown, a two-column row (input controls | results),
    a click handler wiring the button to ``process_image``, example images,
    and an about section. Nothing is launched here — the caller is
    responsible for ``demo.launch()``.
    """
    
    with gr.Blocks(title="AI Manga Translator") as demo:
        # Header / feature overview shown at the top of the page.
        gr.Markdown("""
        # 📚 AI Manga Translator
        
        An intelligent tool designed to detect, recognize, and translate text in images, with specialized features for Manga and Comics.
        
        **Key Capabilities:**
        - 🖌️ **Smart Text Replacement**: Automatically detects text bubbles, wipes them clean, and overlays translated text.
        - 📖 **Manga-Optimized**: Handles vertical text and right-to-left reading order correctly.
        - 🌏 **Multi-Language Translation**: Translates detected text into your preferred language (Chinese, English, French, etc.).
        - 🔍 **High-Precision OCR**: Accurately spots text even in complex backgrounds.
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                # Input section
                gr.Markdown("### 📤 Input")
                input_image = gr.Image(
                    type="pil",
                    label="Upload Image",
                    sources=["upload", "clipboard"]
                )
                
                # Free-form OCR prompt; the placeholder shows the default
                # that process_image uses when this is left empty.
                custom_prompt = gr.Textbox(
                    label="Custom Prompt (Optional)",
                    placeholder="检测并识别图片中的文字,将文本内容与坐标格式化输出。",
                    lines=2
                )
                
                # "Original" disables translation inside process_image.
                target_lang = gr.Dropdown(
                    choices=["Original", "Chinese", "English", "French", "German", "Spanish", "Korean", "Japanese"],
                    value="Chinese",
                    label="Target Language",
                    info="Select language for translation (Original = no translation)"
                )
                
                detect_btn = gr.Button("🔍 Detect & Translate", variant="primary", size="lg")
            
            with gr.Column(scale=1):
                # Output section
                gr.Markdown("### 📊 Results")
                output_image = gr.Image(
                    type="pil",
                    label="Detected Text with Bounding Boxes"
                )
                
                detection_summary = gr.Textbox(
                    label="Detection Summary",
                    lines=10,
                    max_lines=20
                )
                
                # Raw model output, collapsed by default — useful for debugging.
                with gr.Accordion("Raw Model Response", open=False):
                    raw_output = gr.Textbox(label="Raw Output", lines=5)
        
        # Connect the button to the OCR + translation pipeline.
        detect_btn.click(
            fn=process_image,
            inputs=[input_image, custom_prompt, target_lang],
            outputs=[output_image, detection_summary, raw_output]
        )
        
        # Examples only populate the image and prompt inputs; the target
        # language keeps whatever the dropdown currently shows.
        gr.Markdown("### 📝 Examples")
        gr.Examples(
            examples=[
                ["examples/dandadan.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
                ["examples/ruridragon.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
                ["examples/spyfamily.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
            ],
            inputs=[input_image, custom_prompt],
            label="Click to use example image"
        )
        
        gr.Markdown("""
        ---
        ### ℹ️ About
        
        This application combines state-of-the-art AI technologies to provide seamless manga translation:
        
        - **OCR Engine**: HunyuanOCR.
        - **Translation**: Powered by **ERNIE 4.5** for natural and context-aware translations.
        - **Development**: Vibe coded with **Gemini 3 Pro**.
        """)
    
    return demo


if __name__ == "__main__":
    # Build the UI and start the server. The OCR model itself loads lazily
    # on the first request, not here.
    print("Loading model (this may take a minute on first run)...")

    app = create_demo()

    # Bind to all interfaces; flip share=True to get a public gradio link.
    app.launch(
        server_name="0.0.0.0",
        share=False,
        show_error=True,
        ssr_mode=False,
    )