Spaces:
Runtime error
Runtime error
| # Copyright (C) 2021, Mindee. | |
| # This program is licensed under the Apache License version 2. | |
| # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details. | |
| from typing import Any, Dict, List, Tuple | |
| import pandas as pd | |
| import numpy as np | |
| from scipy.cluster.hierarchy import fclusterdata | |
| from doctr.utils.geometry import estimate_page_angle, resolve_enclosing_bbox, resolve_enclosing_rbbox, rotate_boxes | |
| from doctr.utils.repr import NestedObject | |
| __all__ = ['DocumentBuilder'] | |
class DocumentBuilder(NestedObject):
    """Arrange detected words into blocks / lines / words and export them as tables.

    Args:
        resolve_lines: whether words should be automatically grouped into lines
        resolve_blocks: whether lines should be automatically grouped into blocks
        paragraph_break: relative length of the minimum space separating paragraphs
        export_as_straight_boxes: if True, force straight boxes in the export (fit a rectangle
            box to all rotated boxes). Else, keep the boxes format unchanged, no matter what it is.
    """

    def __init__(
        self,
        resolve_lines: bool = True,
        resolve_blocks: bool = True,
        paragraph_break: float = 0.035,
        export_as_straight_boxes: bool = False,
    ) -> None:
        self.resolve_lines = resolve_lines
        self.resolve_blocks = resolve_blocks
        self.paragraph_break = paragraph_break
        self.export_as_straight_boxes = export_as_straight_boxes

    @staticmethod
    def _sort_boxes(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Sort bounding boxes from top to bottom, left to right.

        Declared as a static method because it is invoked as ``self._sort_boxes(boxes)``
        and takes no instance argument.

        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) (in case of rotated bbox)

        Returns:
            tuple: indices of ordered boxes of shape (N,), boxes.
            If straight boxes are passed to the function, boxes are unchanged;
            else the boxes returned are straight boxes fitted to the straightened rotated
            boxes, so that lines are fitted afterwards on the straightened page.
        """
        if boxes.ndim == 3:
            # Undo the estimated page rotation before fitting axis-aligned boxes
            boxes = rotate_boxes(
                loc_preds=boxes,
                angle=-estimate_page_angle(boxes),
                orig_shape=(1024, 1024),
                min_angle=5.,
            )
            # Enclosing straight box of each polygon: (min over points, max over points)
            boxes = np.concatenate((boxes.min(1), boxes.max(1)), -1)
        # Sort key: column 0 plus twice column 3 expressed in units of the median box height,
        # so the vertical position dominates the ordering.
        median_height = np.median(boxes[:, 3] - boxes[:, 1])
        sort_key = boxes[:, 0] + 2 * boxes[:, 3] / median_height
        return sort_key.argsort(), boxes

    def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: List[int]) -> List[List[int]]:
        """Split a line into sub-lines wherever the horizontal gap is too large.

        Args:
            boxes: bounding boxes of shape (N, 4)
            word_idcs: list of indexes for the words of the line

        Returns:
            A list of (sub-)lines computed from the original line (words)
        """
        # Sort words horizontally (by column 0 of their box)
        word_idcs = [word_idcs[idx] for idx in boxes[word_idcs, 0].argsort().tolist()]

        # Nothing to split with fewer than two words
        if len(word_idcs) < 2:
            return [word_idcs]

        lines = []
        sub_line = [word_idcs[0]]
        for i in word_idcs[1:]:
            # Gap between the left edge of this box and the right edge of the previous one
            dist = boxes[i, 0] - boxes[sub_line[-1], 2]
            # A gap that is not strictly below the paragraph break starts a new sub-line
            if not (dist < self.paragraph_break):
                lines.append(sub_line)
                sub_line = []
            sub_line.append(i)
        lines.append(sub_line)
        return lines

    def _resolve_lines(self, boxes: np.ndarray) -> List[List[int]]:
        """Order boxes and group them into lines.

        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) in case of rotated bbox.
                Must contain at least one box (the first sorted box seeds the first line).

        Returns:
            nested list of box indices
        """
        # Sort boxes, and straighten the boxes if they are rotated
        idxs, boxes = self._sort_boxes(boxes)

        # Median box height, used as the scale for the vertical tolerance
        y_med = np.median(boxes[:, 3] - boxes[:, 1])

        lines = []
        words = [idxs[0]]  # Assign the top-left word to the first line
        # Running sum of the y-centers of the words in the current line
        y_center_sum = boxes[idxs[0]][[1, 3]].mean()

        for idx in idxs[1:]:
            y_center = boxes[idx][[1, 3]].mean()
            # Distance between this box's y-center and the mean y-center of the current line
            y_dist = abs(y_center - y_center_sum / len(words))
            # Not close enough to the current line: close it and start a new one
            if not (y_dist < y_med / 2):
                # Compute sub-lines (horizontal split)
                lines.extend(self._resolve_sub_lines(boxes, words))
                words = []
                y_center_sum = 0
            words.append(idx)
            y_center_sum += y_center

        # Use the remaining words to form the last line(s)
        if len(words) > 0:
            lines.extend(self._resolve_sub_lines(boxes, words))
        return lines

    @staticmethod
    def _resolve_blocks(boxes: np.ndarray, lines: List[List[int]]) -> List[List[List[int]]]:
        """Order lines and group them into blocks.

        Declared as a static method because it is invoked as
        ``self._resolve_blocks(boxes, lines)`` and takes no instance argument.

        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2)
            lines: list of lines, each line is a list of idx

        Returns:
            nested list of box indices
        """
        # Resolve enclosing boxes of lines
        if boxes.ndim == 3:
            box_lines = np.asarray([
                resolve_enclosing_rbbox([tuple(boxes[idx, :, :]) for idx in line])
                for line in lines  # type: ignore[misc]
            ])
        else:
            _box_lines = [
                resolve_enclosing_bbox([
                    # type: ignore[misc]
                    (tuple(boxes[idx, :2]), tuple(boxes[idx, 2:])) for idx in line
                ])
                for line in lines
            ]
            box_lines = np.asarray([(x1, y1, x2, y2) for ((x1, y1), (x2, y2)) in _box_lines])

        # Compute geometrical features of lines to clusterize
        # Clusterizing only with box centers yields poor results for complex documents
        if boxes.ndim == 3:
            box_features = np.stack(
                (
                    (box_lines[:, 0, 0] + box_lines[:, 0, 1]) / 2,
                    (box_lines[:, 0, 0] + box_lines[:, 2, 0]) / 2,
                    (box_lines[:, 0, 0] + box_lines[:, 2, 1]) / 2,
                    (box_lines[:, 0, 1] + box_lines[:, 2, 1]) / 2,
                    (box_lines[:, 0, 1] + box_lines[:, 2, 0]) / 2,
                    (box_lines[:, 2, 0] + box_lines[:, 2, 1]) / 2,
                ), axis=-1
            )
        else:
            box_features = np.stack(
                (
                    (box_lines[:, 0] + box_lines[:, 3]) / 2,
                    (box_lines[:, 1] + box_lines[:, 2]) / 2,
                    (box_lines[:, 0] + box_lines[:, 2]) / 2,
                    (box_lines[:, 1] + box_lines[:, 3]) / 2,
                    box_lines[:, 0],
                    box_lines[:, 1],
                ), axis=-1
            )

        # Compute clusters: one cluster label per line
        clusters = fclusterdata(box_features, t=0.1, depth=4, criterion='distance', metric='euclidean')

        # Group line indices by cluster label, keeping first-seen cluster order
        _blocks: Dict[int, List[int]] = {}
        for line_idx, cluster_idx in enumerate(clusters):
            _blocks.setdefault(cluster_idx, []).append(line_idx)

        # Retrieve word-box level to return a fully nested structure
        return [[lines[idx] for idx in block] for block in _blocks.values()]

    def _build_blocks(
        self,
        boxes: np.ndarray,
        word_preds: List[Tuple[str, float]],
        page_shapes: Tuple[int, int],
    ) -> List[Tuple[Any, ...]]:
        """Gather the independent words of one page into flat, structured rows.

        Args:
            boxes: bounding boxes of all detected words of the page, of shape (N, 5) or (N, 4, 2)
            word_preds: list of all detected words of the page, of length N
            page_shapes: shape of this single page, unpacked as (height, width)

        Returns:
            list of rows ``(block_num, line_num, word_num, word, xmin, ymin, xmax, ymax,
            confidence_score)``, with coordinates scaled to the page size and the confidence
            scaled by 100; an empty list if the page has no box.

        Raises:
            ValueError: if the number of boxes differs from the number of word predictions
        """
        if boxes.shape[0] != len(word_preds):
            raise ValueError(
                f"Incompatible argument lengths: {boxes.shape[0]}, {len(word_preds)}")

        if boxes.shape[0] == 0:
            return []

        # Geometry used for grouping: rotated polygons as-is, else the first 4 columns only
        geom_boxes = boxes if boxes.ndim == 3 else boxes[:, :4]

        # Decide whether we try to form lines
        if self.resolve_lines:
            lines = self._resolve_lines(geom_boxes)
            # Decide whether we try to form blocks
            if self.resolve_blocks and len(lines) > 1:
                _blocks = self._resolve_blocks(geom_boxes, lines)
            else:
                _blocks = [lines]
        else:
            # Sort bounding boxes, one line for all boxes, one block for the line
            lines = [self._sort_boxes(geom_boxes)[0]]
            _blocks = [lines]

        # Page size is the same for every word of the page
        h, w = page_shapes

        # NOTE(review): the rows below read columns 0-4 of `boxes` as scalars, which looks
        # like it requires an (N, 5) array (4 coordinates + confidence) — confirm whether
        # rotated (N, 4, 2) or 4-column inputs are expected to reach this point.
        rows = []
        for block_idx, block_lines in enumerate(_blocks):
            for line_idx, line in enumerate(block_lines):
                for word_idx, idx in enumerate(line):
                    rows.append((
                        block_idx, line_idx, word_idx, word_preds[idx],
                        int(round(boxes[idx, 0] * w)), int(round(boxes[idx, 1] * h)),
                        int(round(boxes[idx, 2] * w)), int(round(boxes[idx, 3] * h)),
                        int(round(boxes[idx, 4] * 100)),
                    ))
        return rows

    def extra_repr(self) -> str:
        """Return the configuration of the builder as a string."""
        return (f"resolve_lines={self.resolve_lines}, resolve_blocks={self.resolve_blocks}, "
                f"paragraph_break={self.paragraph_break}, "
                f"export_as_straight_boxes={self.export_as_straight_boxes}")

    def __call__(
        self,
        boxes: List[np.ndarray],
        text_preds: List[List[Tuple[str, float]]],
        page_shapes: List[Tuple[int, int]]
    ) -> List[pd.DataFrame]:
        """Re-arrange detected words into structured blocks, one table per page.

        Args:
            boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5)
                or (*, 6) for all words for a given page
            text_preds: list of N elements, where each element is the list of all word prediction (text + confidence)
            page_shapes: shape of each page, of size N

        Returns:
            list of N DataFrames (one per page) with columns ``block_num``, ``line_num``,
            ``word_num``, ``word``, ``xmin``, ``ymin``, ``xmax``, ``ymax``, ``confidence_score``

        Raises:
            ValueError: if the three arguments do not have the same length
        """
        if len(boxes) != len(text_preds) or len(boxes) != len(page_shapes):
            raise ValueError(
                "All arguments are expected to be lists of the same size")

        if self.export_as_straight_boxes and len(boxes) > 0:
            # If boxes are already straight OK, else fit a bounding rect
            if boxes[0].ndim == 3:
                # NOTE(review): the fitted boxes have 4 columns, while `_build_blocks` reads
                # column 4 as the confidence — confirm this path is expected to work.
                boxes = [
                    np.concatenate((p_boxes.min(1), p_boxes.max(1)), 1)
                    for p_boxes in boxes
                ]

        columns = [
            "block_num", "line_num", "word_num", "word", "xmin", "ymin", "xmax", "ymax", "confidence_score"
        ]
        # Lengths were validated above, so zipping the three lists visits every page
        return [
            pd.DataFrame.from_records(self._build_blocks(page_boxes, word_preds, shape), columns=columns)
            for shape, page_boxes, word_preds in zip(page_shapes, boxes, text_preds)
        ]