Spaces:

AlhitawiMohammed22
/

HTD_HTR

Runtime error

App Files Files Community

HTD_HTR / builder.py

AlhitawiMohammed22

Create Builder Script

ff135d3 over 2 years ago

raw

history blame contribute delete

12.1 kB


	# Copyright (C) 2021, Mindee.

	# This program is licensed under the Apache License version 2.
	# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.


	from typing import Any, Dict, List, Tuple
	import pandas as pd

	import numpy as np
	from scipy.cluster.hierarchy import fclusterdata

	from doctr.utils.geometry import estimate_page_angle, resolve_enclosing_bbox, resolve_enclosing_rbbox, rotate_boxes
	from doctr.utils.repr import NestedObject

	# Names exported by a star-import of this module: only the builder class is public.
	__all__ = ['DocumentBuilder']


	class DocumentBuilder(NestedObject):
	    """Group detected words into lines and blocks and export them as tabular rows.

	    Args:
	        resolve_lines: whether words should be automatically grouped into lines
	        resolve_blocks: whether lines should be automatically grouped into blocks
	        paragraph_break: relative length of the minimum space separating paragraphs
	        export_as_straight_boxes: if True, force straight boxes in the export (fit a rectangle
	            box to all rotated boxes). Else, keep the boxes format unchanged, no matter what it is.
	    """

	    def __init__(
	        self,
	        resolve_lines: bool = True,
	        resolve_blocks: bool = True,
	        paragraph_break: float = 0.035,
	        export_as_straight_boxes: bool = False,
	    ) -> None:

	        self.resolve_lines = resolve_lines
	        self.resolve_blocks = resolve_blocks
	        self.paragraph_break = paragraph_break
	        self.export_as_straight_boxes = export_as_straight_boxes

	    @staticmethod
	    def _sort_boxes(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
	        """Sort bounding boxes from top to bottom, left to right.

	        Args:
	            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) (in case of rotated bbox)

	        Returns:
	            tuple: indices of ordered boxes of shape (N,), boxes
	                If straight boxes are passed to the function, boxes are unchanged
	                else: boxes returned are straight boxes fitted to the straightened rotated boxes
	                so that we fit the lines afterwards to the straightened page
	        """
	        if boxes.ndim == 3:
	            boxes = rotate_boxes(
	                loc_preds=boxes,
	                angle=-estimate_page_angle(boxes),
	                orig_shape=(1024, 1024),
	                min_angle=5.,
	            )
	            # Fit a straight (xmin, ymin, xmax, ymax) box around each straightened polygon
	            boxes = np.concatenate((boxes.min(1), boxes.max(1)), -1)

	        # Sort key: xmin plus twice ymax expressed in units of the median box height
	        median_height = np.median(boxes[:, 3] - boxes[:, 1])
	        sort_key = boxes[:, 0] + 2 * boxes[:, 3] / median_height
	        return sort_key.argsort(), boxes

	    def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: List[int]) -> List[List[int]]:
	        """Split a line in sub_lines wherever the horizontal gap reaches `paragraph_break`.

	        Args:
	            boxes: bounding boxes of shape (N, 4)
	            word_idcs: list of indexes for the words of the line

	        Returns:
	            A list of (sub-)lines computed from the original line (words)
	        """
	        # Sort words horizontally (by xmin)
	        word_idcs = [word_idcs[idx] for idx in boxes[word_idcs, 0].argsort().tolist()]

	        # Fewer than two words: nothing to split
	        if len(word_idcs) < 2:
	            return [word_idcs]

	        lines = []
	        sub_line = [word_idcs[0]]
	        for i in word_idcs[1:]:
	            # Gap between this word's left edge and the previous word's right edge
	            dist = boxes[i, 0] - boxes[sub_line[-1], 2]
	            if dist < self.paragraph_break:
	                # Close enough: same sub-line
	                sub_line.append(i)
	            else:
	                # Horizontal break: close the current sub-line and start a new one
	                lines.append(sub_line)
	                sub_line = [i]
	        lines.append(sub_line)

	        return lines

	    def _resolve_lines(self, boxes: np.ndarray) -> List[List[int]]:
	        """Order boxes to group them in lines.

	        Args:
	            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) in case of rotated bbox

	        Returns:
	            nested list of box indices (an empty list when there are no boxes)
	        """
	        # No box at all: there is no first word to seed the first line with
	        if boxes.shape[0] == 0:
	            return []

	        # Sort boxes, and straighten the boxes if they are rotated
	        idxs, boxes = self._sort_boxes(boxes)

	        # Compute median for boxes heights
	        y_med = np.median(boxes[:, 3] - boxes[:, 1])
	        # Vertical center of every box, computed once
	        y_centers = boxes[:, [1, 3]].mean(axis=1)

	        lines = []
	        words = [idxs[0]]  # Assign the top-left word to the first line
	        # Running sum of y-centers; divided by len(words) it gives the line's mean y-center
	        y_center_sum = y_centers[idxs[0]]

	        for idx in idxs[1:]:
	            # Distance between the box y-center and the mean y-center of the current line
	            y_dist = abs(y_centers[idx] - y_center_sum / len(words))
	            # Same line only if that distance is below half the median box height
	            if not y_dist < y_med / 2:
	                # Vertical break: flush the current line, split horizontally into sub-lines
	                lines.extend(self._resolve_sub_lines(boxes, words))
	                words = []
	                y_center_sum = 0

	            words.append(idx)
	            y_center_sum += y_centers[idx]

	        # Use the remaining words to form the last line(s)
	        if words:
	            lines.extend(self._resolve_sub_lines(boxes, words))

	        return lines

	    @staticmethod
	    def _resolve_blocks(boxes: np.ndarray, lines: List[List[int]]) -> List[List[List[int]]]:
	        """Order lines to group them in blocks.

	        Args:
	            boxes: bounding boxes of shape (N, 4) or (N, 4, 2)
	            lines: list of lines, each line is a list of idx

	        Returns:
	            nested list of box indices (blocks > lines > words)
	        """
	        # Resolve enclosing boxes of lines
	        if boxes.ndim == 3:
	            box_lines = np.asarray([
	                resolve_enclosing_rbbox([tuple(boxes[idx, :, :]) for idx in line])
	                for line in lines  # type: ignore[misc]
	            ])
	        else:
	            _box_lines = [
	                resolve_enclosing_bbox([
	                    (tuple(boxes[idx, :2]), tuple(boxes[idx, 2:])) for idx in line  # type: ignore[misc]
	                ])
	                for line in lines
	            ]
	            box_lines = np.asarray([(x1, y1, x2, y2) for ((x1, y1), (x2, y2)) in _box_lines])

	        # Compute geometrical features of lines to clusterize
	        # Clusterizing only with box centers yields poor results for complex documents
	        if boxes.ndim == 3:
	            box_features = np.stack(
	                (
	                    (box_lines[:, 0, 0] + box_lines[:, 0, 1]) / 2,
	                    (box_lines[:, 0, 0] + box_lines[:, 2, 0]) / 2,
	                    (box_lines[:, 0, 0] + box_lines[:, 2, 1]) / 2,
	                    (box_lines[:, 0, 1] + box_lines[:, 2, 1]) / 2,
	                    (box_lines[:, 0, 1] + box_lines[:, 2, 0]) / 2,
	                    (box_lines[:, 2, 0] + box_lines[:, 2, 1]) / 2,
	                ), axis=-1
	            )
	        else:
	            box_features = np.stack(
	                (
	                    (box_lines[:, 0] + box_lines[:, 3]) / 2,
	                    (box_lines[:, 1] + box_lines[:, 2]) / 2,
	                    (box_lines[:, 0] + box_lines[:, 2]) / 2,
	                    (box_lines[:, 1] + box_lines[:, 3]) / 2,
	                    box_lines[:, 0],
	                    box_lines[:, 1],
	                ), axis=-1
	            )

	        # Compute clusters: one cluster label per line
	        clusters = fclusterdata(box_features, t=0.1, depth=4, criterion='distance', metric='euclidean')

	        # Gather line indices per cluster label, in order of first appearance
	        _blocks: Dict[int, List[int]] = {}
	        for line_idx, cluster_idx in enumerate(clusters):
	            _blocks.setdefault(cluster_idx, []).append(line_idx)

	        # Retrieve word-box level to return a fully nested structure
	        return [[lines[idx] for idx in block] for block in _blocks.values()]

	    def _build_blocks(
	        self,
	        boxes: np.ndarray,
	        word_preds: List[Tuple[str, float]],
	        page_shapes: Tuple[int, int],
	    ) -> List[Tuple[Any, ...]]:
	        """Gather independent words in structured blocks and flatten them into rows.

	        Args:
	            boxes: bounding boxes of all detected words of the page, of shape (N, 5) or (N, 4, 2)
	            word_preds: list of all detected words of the page, of shape N
	            page_shapes: (height, width) of this single page, used to scale box coordinates

	        Returns:
	            list of rows, one per word:
	            (block index, line index, word index in line, word prediction,
	            xmin, ymin, xmax, ymax, confidence), where x values are multiplied by the page
	            width, y values by the page height, the fifth box column by 100, and all are
	            rounded to integers

	        Raises:
	            ValueError: if `boxes` and `word_preds` do not have the same length
	        """
	        if boxes.shape[0] != len(word_preds):
	            raise ValueError(f"Incompatible argument lengths: {boxes.shape[0]}, {len(word_preds)}")

	        if boxes.shape[0] == 0:
	            return []

	        # Geometry used for grouping: rotated boxes as-is, otherwise only the 4 coordinates
	        geometry = boxes if boxes.ndim == 3 else boxes[:, :4]

	        # Decide whether we try to form lines
	        if self.resolve_lines:
	            lines = self._resolve_lines(geometry)
	            # Decide whether we try to form blocks
	            if self.resolve_blocks and len(lines) > 1:
	                _blocks = self._resolve_blocks(geometry, lines)
	            else:
	                _blocks = [lines]
	        else:
	            # Sort bounding boxes, one line for all boxes, one block for the line
	            lines = [self._sort_boxes(geometry)[0]]
	            _blocks = [lines]

	        # NOTE(review): the export below reads boxes[idx, 0..4], i.e. it expects straight boxes
	        # carrying a fifth column. (N, 4, 2) input, or boxes straightened to (N, 4), will fail
	        # here -- confirm whether rotated input is meant to be supported.
	        # NOTE(review): each word_preds entry is stored as-is in the word field; if entries are
	        # (text, confidence) tuples the field will hold the tuple -- confirm with the caller.
	        h, w = page_shapes
	        rows = []
	        for block_idx, block_lines in enumerate(_blocks):
	            for line_idx, line in enumerate(block_lines):
	                for word_idx, idx in enumerate(line):
	                    rows.append((
	                        block_idx, line_idx, word_idx, word_preds[idx],
	                        int(round(boxes[idx, 0] * w)), int(round(boxes[idx, 1] * h)),
	                        int(round(boxes[idx, 2] * w)), int(round(boxes[idx, 3] * h)),
	                        int(round(boxes[idx, 4] * 100)),
	                    ))

	        return rows

	    def extra_repr(self) -> str:
	        """Return the constructor arguments formatted as a `name=value` string."""
	        return (f"resolve_lines={self.resolve_lines}, resolve_blocks={self.resolve_blocks}, "
	                f"paragraph_break={self.paragraph_break}, "
	                f"export_as_straight_boxes={self.export_as_straight_boxes}")

	    def __call__(
	        self,
	        boxes: List[np.ndarray],
	        text_preds: List[List[Tuple[str, float]]],
	        page_shapes: List[Tuple[int, int]],
	    ) -> List[pd.DataFrame]:
	        """Re-arrange detected words into structured blocks, one table per page.

	        Args:
	            boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5)
	                or (*, 6) for all words for a given page
	            text_preds: list of N elements, where each element is the list of all word prediction (text + confidence)
	            page_shapes: shape of each page, of size N

	        Returns:
	            list of N DataFrames (one per page) with columns block_num, line_num, word_num, word,
	            xmin, ymin, xmax, ymax, confidence_score

	        Raises:
	            ValueError: if the three arguments do not have the same length
	        """
	        if len(boxes) != len(text_preds) or len(boxes) != len(page_shapes):
	            raise ValueError("All arguments are expected to be lists of the same size")

	        if self.export_as_straight_boxes and len(boxes) > 0:
	            # If boxes are already straight OK, else fit a bounding rect
	            if boxes[0].ndim == 3:
	                # One (xmin, ymin, xmax, ymax) array per page
	                boxes = [np.concatenate((p_boxes.min(1), p_boxes.max(1)), 1) for p_boxes in boxes]

	        columns = [
	            "block_num", "line_num", "word_num", "word", "xmin", "ymin", "xmax", "ymax", "confidence_score"
	        ]
	        _pages = [
	            pd.DataFrame.from_records(self._build_blocks(page_boxes, word_preds, shape), columns=columns)
	            for shape, page_boxes, word_preds in zip(page_shapes, boxes, text_preds)
	        ]

	        return _pages