"""
Gradio Web UI for HunyuanOCR Text Spotting
Upload an image and get text detection with bounding boxes
"""
import gradio as gr
from PIL import Image
import os
# Set environment variable to avoid tokenizer parallelism deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import spaces
from ocr_model import HunyuanOCR
from visualization import draw_detection_boxes, get_detection_summary, merge_detections
from dotenv import load_dotenv
from openai import OpenAI
from huggingface_hub import hf_hub_download
# Load environment variables
load_dotenv()
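# The translation step (translate_text below) is configured entirely through
# .env. A minimal example with placeholder values (MODEL_NAME falls back to
# ernie-4.5-turbo-128k if unset):
#
#   MODEL_ACCESS_TOKEN=<your-api-key>
#   MODEL_API_URL=<openai-compatible-base-url>
#   MODEL_NAME=ernie-4.5-turbo-128k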
# Global model instance (loaded once)
ocr_model = None
def download_font():
    """Download the font from the Hugging Face Hub if it is not already present"""
font_dir = os.path.join(os.path.dirname(__file__), "fonts")
os.makedirs(font_dir, exist_ok=True)
font_path = os.path.join(font_dir, "NotoSansCJK-Light.ttc")
if not os.path.exists(font_path):
print("Downloading font from Hugging Face Hub...")
try:
hf_hub_download(
repo_id="jzhang533/fonts",
filename="NotoSansCJK-Light.ttc",
repo_type="dataset",
                # local_dir_use_symlinks is deprecated (and ignored) in recent
                # huggingface_hub releases, so it is no longer passed here.
                local_dir=font_dir,
            )
print("Font downloaded successfully!")
except Exception as e:
print(f"Failed to download font: {e}")
def initialize_model():
"""Initialize the OCR model (called once at startup)"""
global ocr_model
if ocr_model is None:
# Ensure font is available
download_font()
print("Initializing HunyuanOCR model...")
ocr_model = HunyuanOCR()
print("Model ready!")
return ocr_model
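# On Hugging Face Spaces, @spaces.GPU requests a ZeroGPU slot for the duration
# of each decorated call and releases it afterwards; outside a Space the
# decorator is effectively a no-op, so local runs are unaffected.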
@spaces.GPU
def process_image(image: Image.Image, prompt: str | None = None, target_language: str = "Chinese"):
"""
Process uploaded image and return annotated result
Args:
image: PIL Image from Gradio
prompt: Optional custom prompt
        target_language: Target language for translation ("Original" skips translation; otherwise Chinese, English, French, etc.)
Returns:
Tuple of (annotated_image, detection_summary, raw_response)
"""
if image is None:
return None, "Please upload an image first.", ""
try:
# Initialize model if needed
model = initialize_model()
# Resize image if height > 960 while maintaining aspect ratio
if image.height > 960:
aspect_ratio = image.width / image.height
new_height = 960
new_width = int(new_height * aspect_ratio)
print(f"Resizing image from {image.size} to ({new_width}, {new_height})")
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
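            # Note: the 960px height cap is this app's heuristic for bounding
            # the vision input size (and thus latency); treat the exact value
            # as tunable rather than a documented HunyuanOCR requirement.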
# Get image dimensions
image_width, image_height = image.size
        # Use the default prompt if none was provided. The Chinese prompt reads:
        # "Detect and recognize the text in the image, and output the text
        # content together with its coordinates in a formatted way."
        if not prompt or prompt.strip() == "":
            prompt = "检测并识别图片中的文字,将文本内容与坐标格式化输出。"
# Detect text
print("Running text detection...")
response = model.detect_text(image, prompt)
# Parse results
detections = model.parse_detection_results(response, image_width, image_height)
        # Merge detections here (rather than letting draw_detection_boxes merge
        # internally) so that translation operates on the merged text regions.
        merged_detections = merge_detections(detections)
# Translate text in merged detections if not "Original"
if target_language != "Original":
print(f"Translating text to {target_language}...")
for det in merged_detections:
original_text = det['text']
translated = translate_text(original_text, target_language)
det['original_text'] = original_text
det['text'] = translated
print(f"Translated: {original_text[:20]}... -> {translated[:20]}...")
else:
print("Skipping translation (Original selected)")
# Draw boxes on image (pass merged detections and disable internal merging)
annotated_image = draw_detection_boxes(image, merged_detections, merge_boxes=False)
# Create summary
summary = get_detection_summary(merged_detections)
        print(f"Detected {len(detections)} text regions ({len(merged_detections)} after merging)")
return annotated_image, summary, response
except Exception as e:
error_msg = f"Error processing image: {str(e)}"
print(error_msg)
return None, error_msg, ""
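# A quick local smoke test for process_image (a sketch; assumes one of the
# bundled example images and, for translation, a configured .env):
#
#   img = Image.open("examples/dandadan.png")
#   annotated, summary, raw = process_image(img, target_language="English")
#   annotated.save("annotated.png")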
def translate_text(text: str, target_language: str = "Chinese") -> str:
"""
    Translate text into the target language via an OpenAI-compatible API, using the model configured in .env
"""
try:
api_key = os.getenv("MODEL_ACCESS_TOKEN")
base_url = os.getenv("MODEL_API_URL")
model_name = os.getenv("MODEL_NAME", "ernie-4.5-turbo-128k") # Default fallback
if not api_key or not base_url:
            print("Warning: MODEL_ACCESS_TOKEN or MODEL_API_URL not set in .env; skipping translation")
return text
client = OpenAI(api_key=api_key, base_url=base_url)
system_prompt = f"You are a professional manga translator. The following text is from a Japanese manga. Translate it into natural and expressive {target_language}, maintaining the character's tone and the context of the scene. Only output the translation, no explanations."
response = client.chat.completions.create(
model=model_name,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": text}
]
)
return response.choices[0].message.content.strip()
except Exception as e:
print(f"Translation error: {e}")
return text
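# Example call (assumes the .env keys above are set; on any error or missing
# configuration the function returns the input text unchanged):
#
#   translate_text("ありがとう", "English")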
def create_demo():
"""Create and configure the Gradio interface"""
with gr.Blocks(title="AI Manga Translator") as demo:
gr.Markdown("""
# 📚 AI Manga Translator
        An intelligent tool that detects, recognizes, and translates text in images, with specialized features for manga and comics.
**Key Capabilities:**
- 🖌️ **Smart Text Replacement**: Automatically detects text bubbles, wipes them clean, and overlays translated text.
- 📖 **Manga-Optimized**: Handles vertical text and right-to-left reading order correctly.
- 🌏 **Multi-Language Translation**: Translates detected text into your preferred language (Chinese, English, French, etc.).
- 🔍 **High-Precision OCR**: Accurately spots text even in complex backgrounds.
""")
with gr.Row():
with gr.Column(scale=1):
# Input section
gr.Markdown("### 📤 Input")
input_image = gr.Image(
type="pil",
label="Upload Image",
sources=["upload", "clipboard"]
)
custom_prompt = gr.Textbox(
label="Custom Prompt (Optional)",
placeholder="检测并识别图片中的文字,将文本内容与坐标格式化输出。",
lines=2
)
target_lang = gr.Dropdown(
choices=["Original", "Chinese", "English", "French", "German", "Spanish", "Korean", "Japanese"],
value="Chinese",
label="Target Language",
info="Select language for translation (Original = no translation)"
)
detect_btn = gr.Button("🔍 Detect & Translate", variant="primary", size="lg")
with gr.Column(scale=1):
# Output section
gr.Markdown("### 📊 Results")
output_image = gr.Image(
type="pil",
label="Detected Text with Bounding Boxes"
)
detection_summary = gr.Textbox(
label="Detection Summary",
lines=10,
max_lines=20
)
with gr.Accordion("Raw Model Response", open=False):
raw_output = gr.Textbox(label="Raw Output", lines=5)
# Connect the button
detect_btn.click(
fn=process_image,
inputs=[input_image, custom_prompt, target_lang],
outputs=[output_image, detection_summary, raw_output]
)
# Examples
gr.Markdown("### 📝 Examples")
gr.Examples(
examples=[
["examples/dandadan.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
["examples/ruridragon.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
["examples/spyfamily.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
],
inputs=[input_image, custom_prompt],
            label="Click an example to load it"
)
gr.Markdown("""
---
### ℹ️ About
This application combines state-of-the-art AI technologies to provide seamless manga translation:
- **OCR Engine**: HunyuanOCR.
- **Translation**: Powered by **ERNIE 4.5** for natural and context-aware translations.
    - **Development**: Vibe-coded with **Gemini 3 Pro**.
""")
return demo
if __name__ == "__main__":
    # Create and launch the demo (the OCR model itself loads lazily inside
    # process_image on the first request)
    print("Starting the Gradio app...")
demo = create_demo()
# Launch with public link option
demo.launch(
server_name="0.0.0.0",
share=False, # Set to True to create a public link
show_error=True,
ssr_mode=False
)