import logging
import os

import pytesseract
from pdf2image import convert_from_path
from PIL import Image

logger = logging.getLogger("ocr_engine")


def extract_text_from_file(file_path: str) -> str:
    """Extract text from a PDF or image file using Tesseract.

    PDFs are rendered to one image per page with ``convert_from_path`` and
    each page is passed to ``pytesseract.image_to_string``; the page texts
    are concatenated, each preceded by a ``--- Page N ---`` header line
    (N starts at 1). Files ending in .png, .jpg, .jpeg, .tiff or .bmp
    (case-insensitive) are opened with Pillow and OCR'd as a single image.

    This function reports failures through its return value instead of
    raising, so callers must inspect the returned string.

    Args:
        file_path: Path to the file to read. The file type is decided from
            the extension only; the file content is not sniffed.

    Returns:
        The extracted text with leading/trailing whitespace stripped, or:
        * ``""`` if ``file_path`` does not exist;
        * ``"Unsupported file format. Please upload PDF or Image."`` for any
          other extension;
        * ``"Error reading PDF: ..."`` / ``"Error reading image: ..."`` if
          conversion or OCR fails;
        * ``"OCR Failed: ..."`` for any other unexpected error.
    """
    if not os.path.exists(file_path):
        return ""

    try:
        # Lower-case once so both extension checks are case-insensitive.
        lowered_path = file_path.lower()

        if lowered_path.endswith('.pdf'):
            try:
                # Convert PDF pages to images, then OCR page by page.
                pages = convert_from_path(file_path)
                text_content = "".join(
                    f"--- Page {number} ---\n"
                    f"{pytesseract.image_to_string(page)}\n"
                    for number, page in enumerate(pages, start=1)
                )
            except Exception as e:
                # logger.exception keeps the traceback alongside the message.
                logger.exception("Error converting PDF: %s", e)
                return f"Error reading PDF: {e}"

        elif lowered_path.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
            try:
                # The context manager closes the underlying file handle even
                # when OCR raises; a bare Image.open() would leave it open.
                with Image.open(file_path) as image:
                    text_content = pytesseract.image_to_string(image)
            except Exception as e:
                logger.exception("Error reading image: %s", e)
                return f"Error reading image: {e}"

        else:
            return "Unsupported file format. Please upload PDF or Image."

    except Exception as e:
        # Last-resort boundary: e.g. a non-str path object without .lower().
        logger.exception("OCR Critical Error: %s", e)
        return f"OCR Failed: {e}"

    return text_content.strip()