File size: 1,549 Bytes
dc79584
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os
import logging
# Module-level logger used to report OCR failures; registered as "ocr_engine".
_OCR_LOGGER_NAME = "ocr_engine"
logger = logging.getLogger(_OCR_LOGGER_NAME)
def extract_text_from_file(file_path: str) -> str:
    """Extract text from a PDF or image file using Tesseract OCR.

    PDFs are converted to one image per page with ``convert_from_path``; each
    page's text is emitted under a ``--- Page N ---`` header (1-based).
    Files ending in .png, .jpg, .jpeg, .tiff or .bmp are opened with PIL and
    passed to Tesseract directly. Extension matching is case-insensitive.

    Args:
        file_path: Path of the file to run OCR on.

    Returns:
        The recognized text with leading/trailing whitespace stripped.
        An empty string is returned when ``file_path`` does not exist.
        Failures are never raised to the caller; instead a message string is
        returned: ``"Error reading PDF: ..."``, ``"Error reading image: ..."``,
        ``"Unsupported file format. Please upload PDF or Image."`` or
        ``"OCR Failed: ..."``.
    """
    if not os.path.exists(file_path):
        return ""

    image_suffixes = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')

    try:
        # Lower-case once so both extension checks are case-insensitive.
        lowered_path = file_path.lower()

        if lowered_path.endswith('.pdf'):
            try:
                # Convert PDF pages to images, then OCR each page in order.
                pages = convert_from_path(file_path)
                text_content = "".join(
                    f"--- Page {number} ---\n"
                    f"{pytesseract.image_to_string(page)}\n"
                    for number, page in enumerate(pages, start=1)
                )
            except Exception as e:
                logger.error("Error converting PDF: %s", e)
                return f"Error reading PDF: {e}"

        elif lowered_path.endswith(image_suffixes):
            try:
                # The context manager closes the underlying file handle as
                # soon as OCR is done instead of leaving it to the GC.
                with Image.open(file_path) as image:
                    text_content = pytesseract.image_to_string(image)
            except Exception as e:
                logger.error("Error reading image: %s", e)
                return f"Error reading image: {e}"

        else:
            return "Unsupported file format. Please upload PDF or Image."

    except Exception as e:
        # Last-resort boundary: anything unexpected (e.g. a non-string path)
        # is reported as a message rather than propagated.
        logger.error("OCR Critical Error: %s", e)
        return f"OCR Failed: {e}"

    return text_content.strip()