# Source: Hugging Face Space topdu/OpenOCR-UniRec-Demo (status: Running)
# File size: 10,069 bytes
# Revision: 5de2f8f

import cv2
import numpy as np


def padding_image(img, size=(640, 640)):
    """
    Pad an image with black pixels on its bottom and right edges.

    The original pixels stay anchored at the top-left corner, so coordinates
    measured on the input remain valid on the output. A dimension that already
    meets or exceeds the target is left untouched (no negative padding).

    :param img: Image array whose first two axes are (height, width); any
        trailing axes (e.g. channels) are preserved as-is.
    :param size: Target size as ``(width, height)`` (default 640x640).
    :return: A new array of shape ``(max(H, height), max(W, width), ...)`` with
        the same dtype as ``img``.
    """
    img_height, img_width = img.shape[:2]
    target_width, target_height = size

    # Clamp at zero so an oversized dimension is passed through instead of
    # producing a negative pad amount.
    pad_bottom = max(0, target_height - img_height)
    pad_right = max(0, target_width - img_width)

    # Only the two spatial axes are padded; channel axes (if any) get (0, 0).
    pad_widths = [(0, pad_bottom), (0, pad_right)]
    pad_widths += [(0, 0)] * (img.ndim - 2)

    return np.pad(img, pad_widths, mode='constant', constant_values=0)


def is_poly_outside_rect(poly, x, y, w, h):
    """
    Tell whether a polygon's bounding box lies entirely outside a rectangle.

    :param poly: Sequence of ``(x, y)`` points.
    :param x: Left edge of the rectangle.
    :param y: Top edge of the rectangle.
    :param w: Rectangle width.
    :param h: Rectangle height.
    :return: ``True`` if the polygon's extent misses the rectangle on the x axis
        or on the y axis (touching an edge counts as inside), else ``False``.
    """
    pts = np.array(poly)
    # Check the x axis first, then the y axis; one separating axis is enough.
    for column, start, extent in ((0, x, w), (1, y, h)):
        coords = pts[:, column]
        if coords.max() < start or coords.min() > start + extent:
            return True
    return False


def split_regions(axis):
    """
    Split a sorted index array into runs of consecutive integers.

    Every run is returned, including the final one (and the only one when the
    whole array is contiguous).

    :param axis: 1-D NumPy array of indices, e.g. the output of
        ``np.where(mask)[0]``.
    :return: List of 1-D arrays, one per run of consecutive values; an empty
        list when ``axis`` is empty.
    """
    if axis.shape[0] == 0:
        return []
    # A new run starts wherever the step between neighbours is not exactly 1.
    run_starts = np.where(np.diff(axis) != 1)[0] + 1
    return np.split(axis, run_starts)


def random_select(axis, max_size):
    """
    Draw two random values from ``axis`` and return them ordered and clipped.

    :param axis: 1-D array of candidate coordinates.
    :param max_size: Size of the dimension; results are clipped to
        ``[0, max_size - 1]``.
    :return: ``(low, high)`` with ``low <= high``.
    """
    picks = np.random.choice(axis, size=2)
    upper_bound = max_size - 1
    low = np.clip(picks.min(), 0, upper_bound)
    high = np.clip(picks.max(), 0, upper_bound)
    return low, high


def region_wise_random_select(regions, max_size):
    """
    Pick one random coordinate from each of two randomly chosen regions.

    The two regions are drawn with replacement, so both coordinates may come
    from the same region.

    :param regions: List of 1-D arrays of candidate coordinates (at least one
        non-empty region is required).
    :param max_size: Unused; kept so the signature parallels
        ``random_select``.
    :return: ``(low, high)`` as Python ints with ``low <= high``.
    """
    selected_index = np.random.choice(len(regions), 2)
    # Draw a scalar directly: converting a 1-element array with int() is
    # deprecated in recent NumPy releases.
    selected_values = [
        int(np.random.choice(regions[index])) for index in selected_index
    ]
    return min(selected_values), max(selected_values)


def crop_area(im, text_polys, min_crop_side_ratio, max_tries):
    """
    Randomly choose a crop rectangle whose edges do not cut through text.

    Columns and rows covered by any polygon's bounding box are marked as
    occupied; crop edges are then sampled only from unoccupied columns/rows.
    A candidate is accepted when it is large enough and overlaps at least one
    polygon.

    :param im: Image array whose first two axes are (height, width).
    :param text_polys: Iterable of polygons, each an ``(N, 2)`` array-like of
        ``(x, y)`` points.
    :param min_crop_side_ratio: Minimum crop width/height as a fraction of the
        image width/height.
    :param max_tries: Number of random candidates to try.
    :return: ``(x, y, w, h)`` of the crop; the full image ``(0, 0, w, h)`` when
        no free row/column exists or no candidate is accepted.
    """
    h, w = im.shape[:2]
    h_array = np.zeros(h, dtype=np.int32)
    w_array = np.zeros(w, dtype=np.int32)
    for points in text_polys:
        points = np.round(points, decimals=0).astype(np.int32)
        # Clamp to the image: a negative bound would otherwise be interpreted
        # as "count from the end" by the slice and mark the wrong span.
        minx, maxx = np.clip((points[:, 0].min(), points[:, 0].max()), 0, w)
        miny, maxy = np.clip((points[:, 1].min(), points[:, 1].max()), 0, h)
        w_array[minx:maxx] = 1
        h_array[miny:maxy] = 1

    # Only rows/columns free of text may serve as crop edges.
    h_axis = np.where(h_array == 0)[0]
    w_axis = np.where(w_array == 0)[0]

    if len(h_axis) == 0 or len(w_axis) == 0:
        return 0, 0, w, h

    h_regions = split_regions(h_axis)
    w_regions = split_regions(w_axis)

    for _ in range(max_tries):
        if len(w_regions) > 1:
            xmin, xmax = region_wise_random_select(w_regions, w)
        else:
            xmin, xmax = random_select(w_axis, w)
        if len(h_regions) > 1:
            ymin, ymax = region_wise_random_select(h_regions, h)
        else:
            ymin, ymax = random_select(h_axis, h)

        crop_w = xmax - xmin
        crop_h = ymax - ymin
        if crop_w < min_crop_side_ratio * w or crop_h < min_crop_side_ratio * h:
            # Candidate is too small; try again.
            continue

        # Accept as soon as one polygon is not entirely outside the crop.
        contains_text = any(
            not is_poly_outside_rect(poly, xmin, ymin, crop_w, crop_h)
            for poly in text_polys)
        if contains_text:
            return xmin, ymin, crop_w, crop_h

    return 0, 0, w, h


class EastRandomCropData(object):
    """
    Random-crop transform that keeps text polygons consistent with the image.

    A crop rectangle is chosen with ``crop_area`` from the non-ignored
    polygons, the crop is resized to ``size``, and polygons are shifted and
    scaled into the output image's coordinate system.
    """

    def __init__(
        self,
        size=(640, 640),
        max_tries=10,
        min_crop_side_ratio=0.1,
        keep_ratio=True,
        **kwargs,
    ):
        """
        :param size: Output size as ``(width, height)``.
        :param max_tries: Number of random crop candidates passed to
            ``crop_area``.
        :param min_crop_side_ratio: Minimum crop side as a fraction of the
            image side, passed to ``crop_area``.
        :param keep_ratio: If true, resize with a single scale factor and pad
            the remainder with zeros; otherwise stretch the crop to ``size``.
        :param kwargs: Accepted and ignored.
        """
        self.size = size
        self.max_tries = max_tries
        self.min_crop_side_ratio = min_crop_side_ratio
        self.keep_ratio = keep_ratio

    def __call__(self, data):
        """
        Crop and resize ``data['image']`` and remap its annotations.

        :param data: Dict with keys ``'image'`` (H x W x C array), ``'polys'``
            (per-polygon ``(N, 2)`` arrays), ``'ignore_tags'`` and ``'texts'``
            (one entry per polygon).
        :return: The same dict, with those four keys replaced by the cropped
            image and the annotations of polygons that still touch it.
        """
        img = data['image']
        text_polys = data['polys']
        ignore_tags = data['ignore_tags']
        texts = data['texts']
        all_care_polys = [
            text_polys[i] for i, tag in enumerate(ignore_tags) if not tag
        ]
        # Compute the crop region.
        crop_x, crop_y, crop_w, crop_h = crop_area(img, all_care_polys,
                                                   self.min_crop_side_ratio,
                                                   self.max_tries)
        # Crop the image; with keep_ratio the aspect ratio is preserved and
        # the rest of the canvas is zero-filled.
        scale_w = self.size[0] / crop_w
        scale_h = self.size[1] / crop_h
        scale = min(scale_w, scale_h)
        h = int(crop_h * scale)
        w = int(crop_w * scale)
        cropped = img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w]
        if self.keep_ratio:
            padimg = np.zeros((self.size[1], self.size[0], img.shape[2]),
                              img.dtype)
            padimg[:h, :w] = cv2.resize(cropped, (w, h))
            img = padimg
            poly_scale = (scale, scale)
            bound_w, bound_h = w, h
        else:
            img = cv2.resize(cropped, tuple(self.size))
            # The image is stretched independently along each axis, so the
            # polygons must use the same per-axis factors and be tested
            # against the full output size to stay aligned with the pixels.
            poly_scale = (scale_w, scale_h)
            bound_w, bound_h = self.size[0], self.size[1]
        # Crop the text boxes: move into crop coordinates, scale, and keep
        # only polygons that are not entirely outside the output content.
        text_polys_crop = []
        ignore_tags_crop = []
        texts_crop = []
        for poly, text, tag in zip(text_polys, texts, ignore_tags):
            poly = ((poly - (crop_x, crop_y)) * poly_scale).tolist()
            if not is_poly_outside_rect(poly, 0, 0, bound_w, bound_h):
                text_polys_crop.append(poly)
                ignore_tags_crop.append(tag)
                texts_crop.append(text)
        data['image'] = img
        data['polys'] = np.array(text_polys_crop)
        data['ignore_tags'] = ignore_tags_crop
        data['texts'] = texts_crop
        return data


class CropResize(object):
    """
    Turn an image into one or more fixed-size tiles.

    Small images are zero-padded to ``size``; larger images are cut into
    ``size`` tiles on a half-tile stride, and edge tiles are zero-padded.
    """

    def __init__(self, size=(640, 640), interpolation=cv2.INTER_LINEAR):
        """
        :param size: Tile size as ``(width, height)``.
        :param interpolation: OpenCV interpolation flag; stored on the
            instance but not used by ``__call__``.
        """
        self.size = size
        self.interpolation = interpolation

    def __call__(self, data):
        """
        Pad or tile ``data['image']`` to the configured size.

        - If the image fits inside the target size, it is returned as a single
          tile (padded on the bottom/right with black when smaller).
        - Otherwise the image is first padded so that neither side is below
          the target, then split into overlapping tiles whose top-left corners
          advance by half the tile size.

        :param data: Dict holding the image under ``'image'``.
        :return: ``(tiles, positions)`` where ``tiles`` is a list of images of
            the target size and ``positions`` is a list of
            ``[left, top, right, bottom]`` boxes. For the single-tile case the
            box is the original image extent; for the tiled case it is the
            un-padded crop extent within the (possibly padded) image.
        """
        img = data['image']
        img_height, img_width = img.shape[:2]
        target_width, target_height = self.size

        # Image fits within one tile: return it directly or pad it.
        if img_width <= target_width and img_height <= target_height:
            if img_width == target_width and img_height == target_height:
                return [img], [[0, 0, img_width, img_height]]
            padded_img = padding_image(img, self.size)
            return [padded_img], [[0, 0, img_width, img_height]]

        # One side may still be shorter than the target; pad it so that every
        # tile can reach full size along that axis.
        pad_right = max(0, target_width - img_width)
        pad_bottom = max(0, target_height - img_height)
        if pad_right or pad_bottom:
            img = cv2.copyMakeBorder(img,
                                     0,
                                     pad_bottom,
                                     0,
                                     pad_right,
                                     cv2.BORDER_CONSTANT,
                                     value=[0, 0, 0])

        img_height, img_width = img.shape[:2]
        stride_x = target_width // 2
        stride_y = target_height // 2

        crop_positions = []
        cropped_img_list = []
        for top in range(0, img_height - stride_y, stride_y):
            bottom = min(top + target_height, img_height)
            # The horizontal stop uses the horizontal stride so non-square
            # tile sizes cover the full width without extra or missing tiles.
            for left in range(0, img_width - stride_x, stride_x):
                right = min(left + target_width, img_width)
                cropped_img = img[top:bottom, left:right]
                # Tiles clipped by the image border are padded to full size.
                if bottom - top < target_height or right - left < target_width:
                    cropped_img = padding_image(cropped_img, self.size)

                cropped_img_list.append(cropped_img)
                crop_positions.append([left, top, right, bottom])

        return cropped_img_list, crop_positions