"""
Gradio Web UI for HunyuanOCR Text Spotting
Upload an image and get text detection with bounding boxes
"""
import gradio as gr
from PIL import Image
import os
# Set environment variable to avoid tokenizer parallelism deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import spaces
from ocr_model import HunyuanOCR
from visualization import draw_detection_boxes, get_detection_summary, merge_detections
from dotenv import load_dotenv
from openai import OpenAI
from huggingface_hub import hf_hub_download
# Load environment variables
load_dotenv()
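# The translation step (translate_text below) is configured entirely through
# .env. A minimal example with placeholder values (MODEL_NAME falls back to
# ernie-4.5-turbo-128k if unset):
#
#   MODEL_ACCESS_TOKEN=<your-api-key>
#   MODEL_API_URL=<openai-compatible-base-url>
#   MODEL_NAME=ernie-4.5-turbo-128k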
# Global model instance (loaded once)
ocr_model = None
def download_font():
    """Download the font from the Hugging Face Hub if it is not already present"""
font_dir = os.path.join(os.path.dirname(__file__), "fonts")
os.makedirs(font_dir, exist_ok=True)
font_path = os.path.join(font_dir, "NotoSansCJK-Light.ttc")
if not os.path.exists(font_path):
print("Downloading font from Hugging Face Hub...")
try:
hf_hub_download(
repo_id="jzhang533/fonts",
filename="NotoSansCJK-Light.ttc",
repo_type="dataset",
                # local_dir_use_symlinks is deprecated (and ignored) in recent
                # huggingface_hub releases, so it is no longer passed here.
                local_dir=font_dir,
            )
print("Font downloaded successfully!")
except Exception as e:
print(f"Failed to download font: {e}")
def initialize_model():
"""Initialize the OCR model (called once at startup)"""
global ocr_model
if ocr_model is None:
# Ensure font is available
download_font()
print("Initializing HunyuanOCR model...")
ocr_model = HunyuanOCR()
print("Model ready!")
return ocr_model
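# On Hugging Face Spaces, @spaces.GPU requests a ZeroGPU slot for the duration
# of each decorated call and releases it afterwards; outside a Space the
# decorator is effectively a no-op, so local runs are unaffected.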
@spaces.GPU
def process_image(image: Image.Image, prompt: str | None = None, target_language: str = "Chinese"):
"""
Process uploaded image and return annotated result
Args:
image: PIL Image from Gradio
prompt: Optional custom prompt
        target_language: Target language for translation ("Original" skips translation; otherwise Chinese, English, French, etc.)
Returns:
Tuple of (annotated_image, detection_summary, raw_response)
"""
if image is None:
return None, "Please upload an image first.", ""
try:
# Initialize model if needed
model = initialize_model()
# Resize image if height > 960 while maintaining aspect ratio
if image.height > 960:
aspect_ratio = image.width / image.height
new_height = 960
new_width = int(new_height * aspect_ratio)
print(f"Resizing image from {image.size} to ({new_width}, {new_height})")
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
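            # Note: the 960px height cap is this app's heuristic for bounding
            # the vision input size (and thus latency); treat the exact value
            # as tunable rather than a documented HunyuanOCR requirement.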
# Get image dimensions
image_width, image_height = image.size
        # Use the default prompt if none was provided. The Chinese prompt reads:
        # "Detect and recognize the text in the image, and output the text
        # content together with its coordinates in a formatted way."
        if not prompt or prompt.strip() == "":
            prompt = "检测并识别图片中的文字,将文本内容与坐标格式化输出。"
# Detect text
print("Running text detection...")
response = model.detect_text(image, prompt)
# Parse results
detections = model.parse_detection_results(response, image_width, image_height)
        # Merge detections here (rather than letting draw_detection_boxes merge
        # internally) so that translation operates on the merged text regions.
        merged_detections = merge_detections(detections)
# Translate text in merged detections if not "Original"
if target_language != "Original":
print(f"Translating text to {target_language}...")
for det in merged_detections:
original_text = det['text']
translated = translate_text(original_text, target_language)
det['original_text'] = original_text
det['text'] = translated
print(f"Translated: {original_text[:20]}... -> {translated[:20]}...")
else:
print("Skipping translation (Original selected)")
# Draw boxes on image (pass merged detections and disable internal merging)
annotated_image = draw_detection_boxes(image, merged_detections, merge_boxes=False)
# Create summary
summary = get_detection_summary(merged_detections)
        print(f"Detected {len(detections)} text regions ({len(merged_detections)} after merging)")
return annotated_image, summary, response
except Exception as e:
error_msg = f"Error processing image: {str(e)}"
print(error_msg)
return None, error_msg, ""
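# A quick local smoke test for process_image (a sketch; assumes one of the
# bundled example images and, for translation, a configured .env):
#
#   img = Image.open("examples/dandadan.png")
#   annotated, summary, raw = process_image(img, target_language="English")
#   annotated.save("annotated.png")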
def translate_text(text: str, target_language: str = "Chinese") -> str:
"""
    Translate text into the target language via an OpenAI-compatible API, using the model configured in .env
"""
try:
api_key = os.getenv("MODEL_ACCESS_TOKEN")
base_url = os.getenv("MODEL_API_URL")
model_name = os.getenv("MODEL_NAME", "ernie-4.5-turbo-128k") # Default fallback
if not api_key or not base_url:
            print("Warning: MODEL_ACCESS_TOKEN or MODEL_API_URL not set in .env; skipping translation")
return text
client = OpenAI(api_key=api_key, base_url=base_url)
system_prompt = f"You are a professional manga translator. The following text is from a Japanese manga. Translate it into natural and expressive {target_language}, maintaining the character's tone and the context of the scene. Only output the translation, no explanations."
response = client.chat.completions.create(
model=model_name,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": text}
]
)
return response.choices[0].message.content.strip()
except Exception as e:
print(f"Translation error: {e}")
return text
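# Example call (assumes the .env keys above are set; on any error or missing
# configuration the function returns the input text unchanged):
#
#   translate_text("ありがとう", "English")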
def create_demo():
"""Create and configure the Gradio interface"""
with gr.Blocks(title="AI Manga Translator") as demo:
gr.Markdown("""
# 📚 AI Manga Translator
        An intelligent tool that detects, recognizes, and translates text in images, with specialized features for manga and comics.
**Key Capabilities:**
- 🖌️ **Smart Text Replacement**: Automatically detects text bubbles, wipes them clean, and overlays translated text.
- 📖 **Manga-Optimized**: Handles vertical text and right-to-left reading order correctly.
- 🌏 **Multi-Language Translation**: Translates detected text into your preferred language (Chinese, English, French, etc.).
- 🔍 **High-Precision OCR**: Accurately spots text even in complex backgrounds.
""")
with gr.Row():
with gr.Column(scale=1):
# Input section
gr.Markdown("### 📤 Input")
input_image = gr.Image(
type="pil",
label="Upload Image",
sources=["upload", "clipboard"]
)
custom_prompt = gr.Textbox(
label="Custom Prompt (Optional)",
placeholder="检测并识别图片中的文字,将文本内容与坐标格式化输出。",
lines=2
)
target_lang = gr.Dropdown(
choices=["Original", "Chinese", "English", "French", "German", "Spanish", "Korean", "Japanese"],
value="Chinese",
label="Target Language",
info="Select language for translation (Original = no translation)"
)
detect_btn = gr.Button("🔍 Detect & Translate", variant="primary", size="lg")
with gr.Column(scale=1):
# Output section
gr.Markdown("### 📊 Results")
output_image = gr.Image(
type="pil",
label="Detected Text with Bounding Boxes"
)
detection_summary = gr.Textbox(
label="Detection Summary",
lines=10,
max_lines=20
)
with gr.Accordion("Raw Model Response", open=False):
raw_output = gr.Textbox(label="Raw Output", lines=5)
# Connect the button
detect_btn.click(
fn=process_image,
inputs=[input_image, custom_prompt, target_lang],
outputs=[output_image, detection_summary, raw_output]
)
# Examples
gr.Markdown("### 📝 Examples")
gr.Examples(
examples=[
["examples/dandadan.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
["examples/ruridragon.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
["examples/spyfamily.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
],
inputs=[input_image, custom_prompt],
            label="Click an example to load it"
)
gr.Markdown("""
---
### ℹ️ About
This application combines state-of-the-art AI technologies to provide seamless manga translation:
- **OCR Engine**: HunyuanOCR.
- **Translation**: Powered by **ERNIE 4.5** for natural and context-aware translations.
    - **Development**: Vibe-coded with **Gemini 3 Pro**.
""")
return demo
if __name__ == "__main__":
    # Create and launch the demo (the OCR model itself loads lazily inside
    # process_image on the first request)
    print("Starting the Gradio app...")
demo = create_demo()
# Launch with public link option
demo.launch(
server_name="0.0.0.0",
share=False, # Set to True to create a public link
show_error=True,
ssr_mode=False
)