Spaces:
Sleeping
Sleeping
| import pytesseract | |
| from pytesseract import Output | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import os | |
| import logging | |
| import numpy as np | |
| logger = logging.getLogger("ocr_engine") | |
def _valid_confidences(raw_values) -> list[float]:
    """Return the usable word confidences from a Tesseract ``conf`` column.

    Tesseract reports -1 for structural rows (blocks, paragraphs, lines)
    that are not words. Depending on the pytesseract/Tesseract version, the
    values may arrive as ints, floats or numeric strings, so each entry is
    coerced to ``float`` before filtering. Entries that cannot be parsed
    are skipped rather than failing the whole document.
    """
    usable = []
    for raw in raw_values:
        try:
            value = float(raw)
        except (TypeError, ValueError):
            continue
        # Negative values (the -1 sentinel) and NaN both fail this test.
        if value >= 0:
            usable.append(value)
    return usable


def extract_text_and_conf(file_path: str) -> tuple[str, float]:
    """Extract OCR text and an average confidence score from a PDF or image.

    PDFs are rasterised with ``convert_from_path`` (one image per page);
    files ending in .png/.jpg/.jpeg/.tiff/.bmp are opened with PIL as a
    single page. Any other extension yields no pages.

    Args:
        file_path: Path to the document to OCR.

    Returns:
        ``(text_content, average_confidence)``. ``text_content`` holds each
        page's text prefixed with a ``--- Page N ---`` header, stripped of
        leading/trailing whitespace. ``average_confidence`` is the mean
        word confidence on Tesseract's 0-100 scale, rounded to two
        decimals. ``("", 0.0)`` is returned when the path does not exist,
        the extension is unsupported, or any loading/OCR step fails
        (failures are logged, never raised).
    """
    if not os.path.exists(file_path):
        return "", 0.0

    lowered_path = file_path.lower()
    images = []
    try:
        # 1. Load images
        if lowered_path.endswith('.pdf'):
            try:
                images = convert_from_path(file_path)
            except Exception as e:
                logger.error("PDF Convert Error: %s", e)
                return "", 0.0
        elif lowered_path.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
            try:
                images = [Image.open(file_path)]
            except Exception as e:
                logger.error("Image Open Error: %s", e)
                return "", 0.0

        # 2. Process each page
        page_chunks = []
        confidences = []
        for page_number, image in enumerate(images, start=1):
            # A. Layout-preserved text (best for LLM consumption)
            page_text = pytesseract.image_to_string(image)
            page_chunks.append(f"--- Page {page_number} ---\n{page_text}\n")

            # B. Per-word confidence data (best for KPIs).
            # data keys: level, page_num, block_num, par_num, line_num,
            # word_num, left, top, width, height, conf, text
            data = pytesseract.image_to_data(image, output_type=Output.DICT)
            confidences.extend(_valid_confidences(data['conf']))

        # 3. Average confidence across every recognised word on every page
        avg_conf = sum(confidences) / len(confidences) if confidences else 0.0
        return "".join(page_chunks).strip(), round(avg_conf, 2)
    except Exception as e:
        logger.error("OCR Critical Error: %s", e)
        return "", 0.0
    finally:
        # Image.open keeps the underlying file handle open until the image
        # is closed; release every page image regardless of outcome.
        for image in images:
            try:
                image.close()
            except Exception:
                # Cleanup is best-effort and must not break the
                # "never raises" contract of this function.
                logger.debug("Failed to close image", exc_info=True)