Spaces:

bla
/

layout_paddle

Paused

App Files Files Community

bla committed on Oct 20

Commit

d97b9d2

verified ·

1 Parent(s): 1184151

Create app.py

Browse files

Files changed (1) hide show

app.py +559 -0

app.py ADDED Viewed

	@@ -0,0 +1,559 @@

+import os
+import io
+import json
+import tempfile
+import requests
+from pathlib import Path
+from typing import Optional, Union
+import base64
+import fitz  # PyMuPDF
+import torch
+import torchvision
+import numpy as np
+from PIL import Image
+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import HTMLResponse, JSONResponse
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel, HttpUrl
+from paddleocr import PaddleOCR
+from doclayout_yolo import YOLOv10
+# Initialize models
+ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+layout_model = YOLOv10('/content/layout-model.pt')
+# Label mapping
+id_to_names = {
+    0: 'title',
+    1: 'plain text',
+    2: 'abandon',
+    3: 'figure',
+    4: 'figure_caption',
+    5: 'table',
+    6: 'table_caption',
+    7: 'table_footnote',
+    8: 'isolate_formula',
+    9: 'formula_caption'
+}
+# FastAPI application object; the route decorators further down register their handlers on it.
+app = FastAPI(title="Document Layout Analysis API", version="1.0.0")
+# Request models
+# JSON body accepted by POST /analyze-url.
+class URLRequest(BaseModel):
+    # Address of the document to download; validated by pydantic's HttpUrl type.
+    url: HttpUrl
+    # Optional target image width, forwarded to process_document as target_width.
+    # None keeps the size of the 150 DPI rendering.
+    resolution: Optional[int] = None
+# Helper functions
+def extract_number_from_caption(caption_text):
+    """Extract the number from a caption like 'Table 3' or 'Figure 2.1'"""
+    import re
+    if not caption_text:
+        return None
+    NUMBER_PATTERN = re.compile(r"(?:Table|Figure)\s*(\d+)", re.IGNORECASE)
+    match = NUMBER_PATTERN.search(caption_text)
+    return match.group(1) if match else None
+def detect_layout_regions(page, target_width=None, conf_threshold=0.25, iou_threshold=0.45):
+    """Use DocLayout-YOLO to detect document elements."""
+    # Get pixmap with optional resolution
+    if target_width:
+        pix = page.get_pixmap(dpi=150)
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        # Resize to target width maintaining aspect ratio
+        aspect_ratio = img.height / img.width
+        target_height = int(target_width * aspect_ratio)
+        img = img.resize((target_width, target_height), Image.LANCZOS)
+        scale_x = target_width / pix.width
+        scale_y = target_height / pix.height
+    else:
+        pix = page.get_pixmap(dpi=150)
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        scale_x = scale_y = 1.0
+    # Run layout detection
+    det_res = layout_model.predict(
+        img,
+        imgsz=1280,
+        conf=conf_threshold,
+        device=device,
+    )[0]
+    boxes = det_res.__dict__['boxes'].xyxy
+    classes = det_res.__dict__['boxes'].cls
+    scores = det_res.__dict__['boxes'].conf
+    # Apply NMS
+    indices = torchvision.ops.nms(
+        boxes=torch.Tensor(boxes),
+        scores=torch.Tensor(scores),
+        iou_threshold=iou_threshold
+    )
+    boxes, scores, classes = boxes[indices], scores[indices], classes[indices]
+    if len(boxes.shape) == 1:
+        boxes = np.expand_dims(boxes, 0)
+        scores = np.expand_dims(scores, 0)
+        classes = np.expand_dims(classes, 0)
+    detected_regions = []
+    for box, score, cls in zip(boxes, scores, classes):
+        # Scale boxes back if resolution was changed
+        box = [float(coord) for coord in box]
+        label_name = id_to_names[int(cls)]
+        detected_regions.append({
+            "bbox": box,
+            "type": label_name,
+            "confidence": float(score)
+        })
+    return detected_regions, img
+def extract_text_from_bbox(page, bbox, target_width=None, padding=5):
+    """Run OCR on a specific bounding box region to extract text."""
+    if target_width:
+        pix = page.get_pixmap(dpi=150)
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        aspect_ratio = img.height / img.width
+        target_height = int(target_width * aspect_ratio)
+        img = img.resize((target_width, target_height), Image.LANCZOS)
+    else:
+        pix = page.get_pixmap(dpi=150)
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+    x0, y0, x1, y1 = [int(coord) for coord in bbox]
+    # Add padding and ensure within bounds
+    x0 = max(0, x0 - padding)
+    y0 = max(0, y0 - padding)
+    x1 = min(img.width, x1 + padding)
+    y1 = min(img.height, y1 + padding)
+    # Crop region
+    region = img.crop((x0, y0, x1, y1))
+    # Convert to bytes for OCR
+    img_byte_arr = io.BytesIO()
+    region.save(img_byte_arr, format='PNG')
+    img_bytes = img_byte_arr.getvalue()
+    # Run OCR
+    ocr_result = ocr.ocr(img_bytes, cls=True)
+    if not ocr_result or not ocr_result[0]:
+        return ""
+    # Concatenate all text
+    text_parts = []
+    for line in ocr_result[0]:
+        text = line[1][0]
+        text_parts.append(text)
+    return " ".join(text_parts)
+def process_document(file_path, target_width=None):
+    """Process a document and extract layout information."""
+    doc = fitz.open(file_path)
+    results = []
+    for page in doc:
+        detected_regions, processed_img = detect_layout_regions(
+            page,
+            target_width=target_width,
+            conf_threshold=0.25,
+            iou_threshold=0.45
+        )
+        image_entries = []
+        table_entries = []
+        # Group regions by type
+        figures = []
+        figure_captions = []
+        tables = []
+        table_captions = []
+        for region in detected_regions:
+            region_type = region["type"].lower()
+            if region_type == 'figure':
+                figures.append(region)
+            elif region_type == 'figure_caption':
+                figure_captions.append(region)
+            elif region_type == 'table':
+                tables.append(region)
+            elif region_type == 'table_caption':
+                table_captions.append(region)
+        # Match figures with their captions
+        for idx, figure in enumerate(figures, start=1):
+            figure_bbox = figure["bbox"]
+            caption_text = None
+            caption_bbox = None
+            min_distance = float('inf')
+            for caption in figure_captions:
+                cap_bbox = caption["bbox"]
+                distance = cap_bbox[1] - figure_bbox[3]
+                if 0 <= distance < min_distance:
+                    min_distance = distance
+                    caption_bbox = cap_bbox
+                    caption_text = extract_text_from_bbox(page, cap_bbox, target_width)
+            figure_number = extract_number_from_caption(caption_text) or str(idx)
+            image_entries.append({
+                "figure_number": figure_number,
+                "figure_bbox": figure_bbox,
+                "caption": caption_text,
+                "caption_bbox": caption_bbox,
+                "confidence": figure["confidence"]
+            })
+        # Match tables with their captions
+        for idx, table in enumerate(tables, start=1):
+            table_bbox = table["bbox"]
+            caption_text = None
+            caption_bbox = None
+            min_distance = float('inf')
+            for caption in table_captions:
+                cap_bbox = caption["bbox"]
+                distance = table_bbox[1] - cap_bbox[3]
+                if 0 <= distance < min_distance:
+                    min_distance = distance
+                    caption_bbox = cap_bbox
+                    caption_text = extract_text_from_bbox(page, cap_bbox, target_width)
+            table_number = extract_number_from_caption(caption_text) or str(idx)
+            table_entries.append({
+                "table_number": table_number,
+                "bbox": table_bbox,
+                "caption": caption_text,
+                "caption_bbox": caption_bbox,
+                "confidence": table["confidence"]
+            })
+        results.append({
+            "page_number": page.number + 1,
+            "figures": image_entries,
+            "tables": table_entries,
+            "image_dimensions": {
+                "width": processed_img.width,
+                "height": processed_img.height
+            }
+        })
+    doc.close()
+    return results
+# API Endpoints
+@app.get("/", response_class=HTMLResponse)
+async def read_root():
+    """Serve the frontend UI"""
+    html_content = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Document Layout Analysis API</title>
+    <script src="https://cdn.tailwindcss.com"></script>
+    <style>
+        .card-grainy { filter: url(#grainy); }
+    </style>
+</head>
+<body class="bg-[#09090B] min-h-screen">
+    <svg class="absolute h-0 w-0">
+        <filter id="grainy">
+            <feTurbulence type="fractalNoise" baseFrequency="0.7" numOctaves="2" result="noise" />
+            <feComponentTransfer>
+                <feFuncA type="table" tableValues="0 0.15 0" />
+            </feComponentTransfer>
+        </filter>
+    </svg>
+    <div class="container mx-auto px-4 py-12">
+        <!-- Header -->
+        <div class="mb-12 text-center">
+            <h3 class="text-sm font-semibold tracking-wider text-cyan-400/90 uppercase mb-4">AI-Powered Document Analysis</h3>
+            <h1 class="text-5xl font-bold mb-4">
+                <span class="bg-gradient-to-r from-gray-100 to-gray-300 bg-clip-text text-transparent">Document Layout</span>
+                <span class="text-gray-600"> Detection API</span>
+            </h1>
+            <p class="text-gray-400 text-lg">Extract tables, figures, and captions from PDFs and images with precision</p>
+        </div>
+        <!-- Main Card -->
+        <div class="relative isolate max-w-4xl mx-auto rounded-3xl border border-white/10 bg-gradient-to-br from-[#1A1D29] via-[#151821] to-[#0F1117] p-10">
+            <div class="card-grainy absolute top-0 left-0 h-full w-full"></div>
+            <div class="pointer-events-none absolute top-0 left-0 h-96 w-96 rounded-full bg-blue-500/5 blur-3xl"></div>
+            <div class="relative">
+                <!-- Upload Section -->
+                <div class="mb-8">
+                    <label class="block text-sm font-semibold text-gray-300 mb-4">Upload Document</label>
+                    <div class="rounded-2xl bg-black/30 p-8 ring-1 ring-white/10 backdrop-blur-sm">
+                        <input type="file" id="fileInput" accept=".pdf,.png,.jpg,.jpeg"
+                               class="block w-full text-sm text-gray-400 file:mr-4 file:py-3 file:px-6 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-cyan-500/10 file:text-cyan-400 hover:file:bg-cyan-500/20 cursor-pointer">
+                    </div>
+                </div>
+                <!-- OR Divider -->
+                <div class="flex items-center my-8">
+                    <div class="flex-1 h-px bg-white/10"></div>
+                    <span class="px-4 text-gray-500 text-sm font-semibold">OR</span>
+                    <div class="flex-1 h-px bg-white/10"></div>
+                </div>
+                <!-- URL Section -->
+                <div class="mb-8">
+                    <label class="block text-sm font-semibold text-gray-300 mb-4">Document URL</label>
+                    <div class="rounded-2xl bg-black/30 p-8 ring-1 ring-white/10 backdrop-blur-sm">
+                        <input type="url" id="urlInput" placeholder="https://example.com/document.pdf"
+                               class="w-full bg-white/5 border border-white/10 rounded-lg px-4 py-3 text-gray-300 placeholder-gray-600 focus:outline-none focus:ring-2 focus:ring-cyan-500/50">
+                    </div>
+                </div>
+                <!-- Resolution Section -->
+                <div class="mb-8">
+                    <label class="block text-sm font-semibold text-gray-300 mb-4">
+                        Target Width (Optional)
+                        <span class="text-gray-500 text-xs font-normal ml-2">Leave empty for original size</span>
+                    </label>
+                    <div class="rounded-2xl bg-black/30 p-8 ring-1 ring-white/10 backdrop-blur-sm">
+                        <input type="number" id="resolutionInput" placeholder="e.g., 1280" min="256" max="4096"
+                               class="w-full bg-white/5 border border-white/10 rounded-lg px-4 py-3 text-gray-300 placeholder-gray-600 focus:outline-none focus:ring-2 focus:ring-cyan-500/50">
+                    </div>
+                </div>
+                <!-- Analyze Button -->
+                <button id="analyzeBtn" onclick="analyzeDocument()"
+                        class="w-full py-4 rounded-lg bg-gradient-to-r from-cyan-500 to-blue-500 text-white font-semibold text-lg hover:from-cyan-600 hover:to-blue-600 transition-all shadow-lg hover:shadow-cyan-500/25">
+                    Analyze Document
+                </button>
+                <!-- Loading -->
+                <div id="loading" class="hidden mt-8 text-center">
+                    <div class="inline-block animate-spin rounded-full h-12 w-12 border-4 border-cyan-500 border-t-transparent"></div>
+                    <p class="text-gray-400 mt-4">Processing document...</p>
+                </div>
+                <!-- Results -->
+                <div id="results" class="hidden mt-8">
+                    <h3 class="text-xl font-bold text-gray-300 mb-4">Analysis Results</h3>
+                    <div class="rounded-2xl bg-black/30 p-8 ring-1 ring-white/10 backdrop-blur-sm">
+                        <pre id="resultsContent" class="text-sm text-gray-300 overflow-x-auto"></pre>
+                    </div>
+                    <button onclick="downloadJSON()" class="mt-4 px-6 py-3 rounded-lg bg-emerald-500/10 text-emerald-400 font-semibold hover:bg-emerald-500/20 transition-all ring-1 ring-emerald-500/30">
+                        Download JSON
+                    </button>
+                </div>
+                <!-- Error -->
+                <div id="error" class="hidden mt-8 rounded-2xl bg-rose-500/10 p-6 ring-1 ring-rose-500/30">
+                    <p class="text-rose-400 font-semibold" id="errorMessage"></p>
+                </div>
+            </div>
+        </div>
+        <!-- API Documentation -->
+        <div class="mt-16 max-w-4xl mx-auto">
+            <h2 class="text-3xl font-bold text-gray-300 mb-8">API Documentation</h2>
+            <div class="space-y-6">
+                <!-- Endpoint 1 -->
+                <div class="rounded-2xl border border-white/10 bg-gradient-to-br from-[#1A1D29] via-[#151821] to-[#0F1117] p-8">
+                    <div class="flex items-center gap-3 mb-4">
+                        <span class="inline-flex items-center rounded-lg bg-emerald-500/10 px-3 py-1.5 text-xs font-bold text-emerald-400 uppercase ring-1 ring-emerald-500/30">POST</span>
+                        <code class="text-cyan-400 text-lg font-mono">/analyze</code>
+                    </div>
+                    <p class="text-gray-400 mb-4">Analyze a document by uploading a file</p>
+                    <div class="bg-black/30 rounded-lg p-4 overflow-x-auto">
+                        <pre class="text-sm text-gray-300"><code>curl -X POST "http://your-api-url/analyze" \\
+  -F "[email protected]" \\
+  -F "resolution=1280"</code></pre>
+                    </div>
+                </div>
+                <!-- Endpoint 2 -->
+                <div class="rounded-2xl border border-white/10 bg-gradient-to-br from-[#1A1D29] via-[#151821] to-[#0F1117] p-8">
+                    <div class="flex items-center gap-3 mb-4">
+                        <span class="inline-flex items-center rounded-lg bg-emerald-500/10 px-3 py-1.5 text-xs font-bold text-emerald-400 uppercase ring-1 ring-emerald-500/30">POST</span>
+                        <code class="text-cyan-400 text-lg font-mono">/analyze-url</code>
+                    </div>
+                    <p class="text-gray-400 mb-4">Analyze a document from a URL</p>
+                    <div class="bg-black/30 rounded-lg p-4 overflow-x-auto">
+                        <pre class="text-sm text-gray-300"><code>curl -X POST "http://your-api-url/analyze-url" \\
+  -H "Content-Type: application/json" \\
+  -d '{"url": "https://example.com/doc.pdf", "resolution": 1280}'</code></pre>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    <script>
+        let analysisResults = null;
+        async function analyzeDocument() {
+            const fileInput = document.getElementById('fileInput');
+            const urlInput = document.getElementById('urlInput');
+            const resolutionInput = document.getElementById('resolutionInput');
+            const loading = document.getElementById('loading');
+            const results = document.getElementById('results');
+            const error = document.getElementById('error');
+            // Hide previous results
+            results.classList.add('hidden');
+            error.classList.add('hidden');
+            const resolution = resolutionInput.value ? parseInt(resolutionInput.value) : null;
+            try {
+                loading.classList.remove('hidden');
+                let response;
+                if (fileInput.files.length > 0) {
+                    // File upload
+                    const formData = new FormData();
+                    formData.append('file', fileInput.files[0]);
+                    if (resolution) formData.append('resolution', resolution);
+                    response = await fetch('/analyze', {
+                        method: 'POST',
+                        body: formData
+                    });
+                } else if (urlInput.value) {
+                    // URL analysis
+                    const body = { url: urlInput.value };
+                    if (resolution) body.resolution = resolution;
+                    response = await fetch('/analyze-url', {
+                        method: 'POST',
+                        headers: { 'Content-Type': 'application/json' },
+                        body: JSON.stringify(body)
+                    });
+                } else {
+                    throw new Error('Please provide a file or URL');
+                }
+                if (!response.ok) {
+                    const errorData = await response.json();
+                    throw new Error(errorData.detail || 'Analysis failed');
+                }
+                analysisResults = await response.json();
+                document.getElementById('resultsContent').textContent = JSON.stringify(analysisResults, null, 2);
+                results.classList.remove('hidden');
+            } catch (err) {
+                document.getElementById('errorMessage').textContent = err.message;
+                error.classList.remove('hidden');
+            } finally {
+                loading.classList.add('hidden');
+            }
+        }
+        function downloadJSON() {
+            if (!analysisResults) return;
+            const blob = new Blob([JSON.stringify(analysisResults, null, 2)], { type: 'application/json' });
+            const url = URL.createObjectURL(blob);
+            const a = document.createElement('a');
+            a.href = url;
+            a.download = 'layout_analysis.json';
+            a.click();
+            URL.revokeObjectURL(url);
+        }
+    </script>
+</body>
+</html>
+    """
+    return HTMLResponse(content=html_content)
+@app.post("/analyze")
+async def analyze_file(
+    file: UploadFile = File(...),
+    resolution: Optional[int] = Form(None)
+):
+    """Analyze an uploaded document file"""
+    try:
+        # Save uploaded file temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix) as tmp:
+            content = await file.read()
+            tmp.write(content)
+            tmp_path = tmp.name
+        # Process document
+        results = process_document(tmp_path, target_width=resolution)
+        # Cleanup
+        os.unlink(tmp_path)
+        return JSONResponse(content={
+            "status": "success",
+            "filename": file.filename,
+            "pages": len(results),
+            "results": results
+        })
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/analyze-url")
+async def analyze_url(request: URLRequest):
+    """Analyze a document from a URL"""
+    try:
+        # Download file from URL
+        response = requests.get(str(request.url), timeout=30)
+        response.raise_for_status()
+        # Determine file extension
+        content_type = response.headers.get('content-type', '')
+        if 'pdf' in content_type:
+            ext = '.pdf'
+        elif 'image' in content_type:
+            ext = '.png'
+        else:
+            ext = '.pdf'  # default
+        # Save temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
+            tmp.write(response.content)
+            tmp_path = tmp.name
+        # Process document
+        results = process_document(tmp_path, target_width=request.resolution)
+        # Cleanup
+        os.unlink(tmp_path)
+        return JSONResponse(content={
+            "status": "success",
+            "url": str(request.url),
+            "pages": len(results),
+            "results": results
+        })
+    except requests.RequestException as e:
+        raise HTTPException(status_code=400, detail=f"Failed to download file: {str(e)}")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {"status": "healthy", "device": device}
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)