Spaces:

avans06
/

AdenzuMangaPanelExtractor

Running

File size: 19,554 Bytes

import os
from typing import Callable
import cv2
import warnings
import numpy as np
from image_processing.image import is_contour_rectangular, apply_adaptive_threshold, group_contours_horizontally, group_contours_vertically, adaptive_hconcat, adaptive_vconcat, group_bounding_boxes_horizontally, group_bounding_boxes_vertically
from myutils.myutils import load_images, load_image
from tqdm import tqdm
from image_processing.model import model
from manga_panel_processor import sort_panels_by_column_then_row

class OutputMode:
    """
    String constants selecting how a panel is cut out of the page
    """
    BOUNDING = 'bounding'  # crop the bounding box of the panel contour
    MASKED = 'masked'  # keep only the pixels inside the panel contour

    @staticmethod
    def from_index(index: int) -> str:
        """
        Returns the output mode for a positional index (0 = bounding, 1 = masked)

        Raises IndexError when the index is out of range
        """
        # Declared static so the lookup works on the class and on instances alike
        return (OutputMode.BOUNDING, OutputMode.MASKED)[index]


class MergeMode:
    """
    String constants selecting whether and how extracted panels are merged
    """
    NONE = 'none'  # every panel is returned on its own
    VERTICAL = 'vertical'  # panels of a group are concatenated vertically
    HORIZONTAL = 'horizontal'  # panels of a group are concatenated horizontally

    @staticmethod
    def from_index(index: int) -> str:
        """
        Returns the merge mode for a positional index (0 = none, 1 = vertical, 2 = horizontal)

        Raises IndexError when the index is out of range
        """
        # Declared static so the lookup works on the class and on instances alike
        return (MergeMode.NONE, MergeMode.VERTICAL, MergeMode.HORIZONTAL)[index]
    

def get_background_intensity_range(grayscale_image: np.ndarray, min_range: int = 1) -> tuple[int, int]:
    """
    Returns the minimum and maximum intensity values of the background of the image

    The background is sampled from the border line (bottom row, top row, left
    column or right column) with the lowest variance. The returned range is
    widened downwards so that it spans at least `min_range` intensity levels,
    without going below 0.

    Parameters:
    - grayscale_image: Single channel image to inspect
    - min_range: Minimum distance between the returned minimum and maximum

    Returns (min_intensity, max_intensity) as plain Python ints
    """
    borders = (grayscale_image[-1, :], grayscale_image[0, :], grayscale_image[:, 0], grayscale_image[:, -1])
    # min() keeps the first border on ties, same as taking element 0 of a stable sort
    least_varied_edge = min(borders, key=np.var)

    # Convert to Python ints before doing arithmetic: subtracting from an
    # unsigned NumPy scalar can wrap around instead of going negative.
    max_intensity = int(np.max(least_varied_edge))
    lowest_on_edge = int(np.min(least_varied_edge))
    min_intensity = max(min(lowest_on_edge, max_intensity - min_range), 0)

    return min_intensity, max_intensity


def generate_background_mask(grayscale_image: np.ndarray) -> np.ndarray:
    """
    Builds a background mask out of the large, near-white connected regions of the image

    A region is accepted when it is almost as wide or as tall as the page, or
    when its outer contour is rectangular. Regions are visited from largest to
    smallest and the scan stops at the first one that is too small.
    The accepted regions are set to 255 in the mask, which is then dilated.
    """
    white = 255
    lower_bound, _ = get_background_intensity_range(grayscale_image, 25)
    # Anything darker than 240 is never considered background
    lower_bound = max(lower_bound, 240)

    _, binary = cv2.threshold(grayscale_image, lower_bound, white, cv2.THRESH_BINARY)
    _, labels, stats, _ = cv2.connectedComponentsWithStats(binary)

    mask = np.zeros_like(binary)
    mask_height, mask_width = mask.shape

    # Components smaller than 1/1024 of the page end the scan
    min_component_area = mask.size // 1024

    # A component spanning at least 95% of a page dimension counts as the whole background
    size_tolerance = 0.05
    min_full_width = mask_width * (1 - size_tolerance)
    min_full_height = mask_height * (1 - size_tolerance)

    # Label 0 is skipped; the remaining labels are ordered by descending area
    labels_by_area = np.argsort(stats[1:, 4])[::-1] + 1

    for label in labels_by_area:
        _, _, w, h, area = stats[label]
        if area < min_component_area:
            break

        component = labels == label
        spans_page = (w > min_full_width) or (h > min_full_height)
        # The contour test is only evaluated when the cheap size tests fail
        if spans_page or is_contour_rectangular(
            cv2.findContours(component.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0][0]
        ):
            mask[component] = white

    return cv2.dilate(mask, np.ones((3, 3), np.uint8), iterations=2)


def extract_panels(
    image: np.ndarray,
    panel_contours: list[np.ndarray],
    accept_page_as_panel: bool = True,
    mode: str = OutputMode.BOUNDING,
    fill_in_color: tuple[int, int, int] = (0, 0, 0),
) -> list[np.ndarray]:
    """
    Cuts one panel image out of the page for every given contour

    Parameters:
    - image: The page to cut the panels from
    - panel_contours: One contour per panel
    - accept_page_as_panel: When False, contours whose bounding box covers at
      least 99% of the page width or height are skipped
    - mode: How the panel is cut out
        - 'masked': Keeps only the pixels inside the contour; the rest of the
          bounding box is filled with fill_in_color
        - 'bounding': Crops the bounding box of the contour
    - fill_in_color: Color used outside the contour in 'masked' mode

    Returns the panel images in the order of the contours
    """
    page_height, page_width = image.shape[:2]
    panels = []

    for contour in panel_contours:
        left, top, box_width, box_height = cv2.boundingRect(contour)

        covers_page = (box_width >= page_width * 0.99) or (box_height >= page_height * 0.99)
        if covers_page and not accept_page_as_panel:
            continue

        rows = slice(top, top + box_height)
        cols = slice(left, left + box_width)

        if mode != 'masked':
            panels.append(image[rows, cols])
            continue

        # Page-sized stencil that is white inside the contour
        stencil = np.zeros_like(image)
        cv2.drawContours(stencil, [contour], -1, (255, 255, 255), -1)

        inside = cv2.bitwise_and(image, stencil)[rows, cols]
        # Fill color restricted to the part of the box outside the contour
        outside = cv2.bitwise_and(cv2.bitwise_not(stencil[rows, cols]), fill_in_color)
        panels.append(cv2.bitwise_or(outside, inside))

    return panels


def preprocess_image(grayscale_image: np.ndarray) -> np.ndarray:
    """
    Returns the Laplacian of the 3x3 Gaussian-blurred image
    """
    blurred = cv2.GaussianBlur(grayscale_image, (3, 3), 0)
    # ddepth -1 keeps the depth of the input image
    return cv2.Laplacian(blurred, -1)


def preprocess_image_with_dilation(grayscale_image: np.ndarray) -> np.ndarray:
    """
    Returns the inverted, dilated Laplacian of the 3x3 Gaussian-blurred image
    """
    blurred = cv2.GaussianBlur(grayscale_image, (3, 3), 0)
    edges = cv2.Laplacian(blurred, -1)
    thick_edges = cv2.dilate(edges, np.ones((5, 5), np.uint8), iterations=1)
    # Invert so that the edge response becomes dark
    return 255 - thick_edges


def joint_panel_split_extraction(grayscale_image: np.ndarray, background_mask: np.ndarray) -> np.ndarray:
    """
    Extracts the panels from the image with splitting the joint panels

    The background mask is thinned, then each 3x3 template below is matched
    against it and every match is stretched into a straight line by dilating
    with the paired line-shaped kernel. The lines are OR-ed back into the mask,
    the mask is thickened again and finally inverted.

    Parameters:
    - grayscale_image: Only its shape is used, to size the line kernels
    - background_mask: Background mask (non-zero = background)

    Returns 255 minus the processed mask
    """
    pixels_before = np.count_nonzero(background_mask)
    background_mask = cv2.ximgproc.thinning(background_mask)

    # 3x3 templates: the centre pixel plus exactly one neighbour
    up_kernel = np.array([[0, 0, 0], [0, 1, 0], [0, 1, 0]], np.uint8)
    down_kernel = np.array([[0, 1, 0], [0, 1, 0], [0, 0, 0]], np.uint8)
    left_kernel = np.array([[0, 0, 0], [0, 1, 1], [0, 0, 0]], np.uint8)
    right_kernel = np.array([[0, 0, 0], [1, 1, 0], [0, 0, 0]], np.uint8)

    down_right_diagonal_kernel = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 0]], np.uint8)
    down_left_diagonal_kernel = np.array([[0, 0, 1], [0, 1, 0], [0, 0, 0]], np.uint8)
    up_left_diagonal_kernel = np.array([[0, 0, 0], [0, 1, 0], [0, 0, 1]], np.uint8)
    up_right_diagonal_kernel = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]], np.uint8)

    PAGE_TO_JOINT_OBJECT_RATIO = 3
    image_height, image_width = grayscale_image.shape

    height_based_size = image_height // PAGE_TO_JOINT_OBJECT_RATIO
    width_based_size = (2 * image_width) // PAGE_TO_JOINT_OBJECT_RATIO

    # Force odd sizes so that every kernel has a centre pixel
    height_based_size += height_based_size % 2 + 1
    width_based_size += width_based_size % 2 + 1

    half_height = height_based_size // 2
    half_width = width_based_size // 2

    # Line kernels: a single arm that starts at the kernel centre.
    # cv2.dilate sets a pixel when a dot lies under any kernel element, so an
    # arm pointing one way grows the line from the dot in the opposite direction.
    up_dilation_kernel = np.zeros((height_based_size, height_based_size), np.uint8)
    up_dilation_kernel[half_height:, half_height] = 1

    down_dilation_kernel = np.zeros((height_based_size, height_based_size), np.uint8)
    down_dilation_kernel[:half_height + 1, half_height] = 1

    left_dilation_kernel = np.zeros((width_based_size, width_based_size), np.uint8)
    left_dilation_kernel[half_width, half_width:] = 1

    right_dilation_kernel = np.zeros((width_based_size, width_based_size), np.uint8)
    right_dilation_kernel[half_width, :half_width + 1] = 1

    min_based_size = min(width_based_size, height_based_size)
    half_min = min_based_size // 2

    diagonal = np.identity(half_min + 1, dtype=np.uint8)
    anti_diagonal = np.flip(diagonal, axis=1)

    # Arm from the top-left corner to the centre
    down_right_dilation_kernel = np.pad(diagonal, ((0, half_min), (0, half_min)))

    # Arm from the centre to the bottom-right corner. Rows AND columns are
    # shifted by half the size so the arm starts at the centre like the other
    # diagonals; shifting only the rows produced a line detached from its dot.
    up_left_dilation_kernel = np.pad(diagonal, ((half_min, 0), (half_min, 0)))

    # Arm from the centre to the bottom-left corner
    up_right_dilation_kernel = np.pad(anti_diagonal, ((half_min, 0), (0, half_min)))

    # Arm from the top-right corner to the centre
    down_left_dilation_kernel = np.pad(anti_diagonal, ((0, half_min), (half_min, 0)))

    match_kernels = [
        up_kernel,
        down_kernel,
        left_kernel,
        right_kernel,
        down_right_diagonal_kernel,
        down_left_diagonal_kernel,
        up_left_diagonal_kernel,
        up_right_diagonal_kernel,
    ]

    dilation_kernels = [
        up_dilation_kernel,
        down_dilation_kernel,
        left_dilation_kernel,
        right_dilation_kernel,
        down_right_dilation_kernel,
        down_left_dilation_kernel,
        up_left_dilation_kernel,
        up_right_dilation_kernel,
    ]

    def get_dots(binary_image: np.ndarray, kernel: np.ndarray) -> np.ndarray:
        """
        Returns an image of the same size with 255 wherever the kernel matches
        """
        scores = cv2.matchTemplate(binary_image, kernel, cv2.TM_CCOEFF_NORMED)
        _, scores = cv2.threshold(scores, 0.9, 1, cv2.THRESH_BINARY)
        dots = np.where(scores == 1, 255, 0).astype(np.uint8)
        # matchTemplate shrinks the result by the kernel size; pad it back so
        # that a match is located at the kernel centre
        pad_height = (kernel.shape[0] - 1) // 2
        pad_width = (kernel.shape[1] - 1) // 2
        return cv2.copyMakeBorder(
            dots,
            pad_height,
            kernel.shape[0] - pad_height - 1,
            pad_width,
            kernel.shape[1] - pad_width - 1,
            cv2.BORDER_CONSTANT,
            value=0,
        )

    # Each pass works on the mask that already contains the earlier lines
    for match_kernel, dilation_kernel in zip(match_kernels, dilation_kernels):
        dots = get_dots(background_mask, match_kernel)
        lines = cv2.dilate(dots, dilation_kernel, iterations=1)
        background_mask = cv2.bitwise_or(background_mask, lines)

    pixels_now = np.count_nonzero(background_mask)
    # An empty mask has nothing to thicken; also avoids dividing by zero
    dilation_size = pixels_before // (4 * pixels_now) if pixels_now else 0
    # Round to an odd size of at least 1
    dilation_size += dilation_size % 2 + 1
    background_mask = cv2.dilate(background_mask, np.ones((dilation_size, dilation_size), np.uint8), iterations=1)

    page_without_background = 255 - background_mask

    return page_without_background


def is_contour_sufficiently_big(contour: np.ndarray, image_height: int, image_width: int) -> bool:
    """
    Tells whether the contour area exceeds 1/32 of the image area
    """
    page_to_panel_ratio = 32
    min_area = (image_width * image_height) // page_to_panel_ratio
    return cv2.contourArea(contour) > min_area


def threshold_extraction(
        image: np.ndarray, 
        grayscale_image: np.ndarray, 
        mode: str = OutputMode.BOUNDING,
) -> list[np.ndarray]:
    """
    Extracts panels from the image using thresholding

    Contours are searched in the adaptive threshold of the Laplacian edge image
    with the strong edge responses (above 8) subtracted. Page-sized contours
    are not accepted as panels.

    Parameters:
    - image: The page the panels are cut from
    - grayscale_image: Grayscale version of the page, used for the detection
    - mode: The output mode forwarded to extract_panels
    """
    edges = cv2.Laplacian(cv2.GaussianBlur(grayscale_image, (3, 3), 0), -1)
    _, strong_edges = cv2.threshold(edges, 8, 255, cv2.THRESH_BINARY)

    candidate_map = cv2.subtract(apply_adaptive_threshold(edges), strong_edges)
    candidate_map = cv2.dilate(candidate_map, np.ones((3, 3), np.uint8), iterations=2)

    found, _ = cv2.findContours(candidate_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    big_enough = [
        contour for contour in found
        if is_contour_sufficiently_big(contour, image.shape[0], image.shape[1])
    ]

    return extract_panels(image, big_enough, False, mode=mode)


def get_page_without_background(grayscale_image: np.ndarray, background_mask: np.ndarray, split_joint_panels = False) -> np.ndarray:
    """
    Removes the background from the page

    When split_joint_panels is set and the mask covers less than 30% of the
    page, the joint panel splitting is used. Otherwise the mask is simply
    subtracted from the grayscale image.
    """
    stripe_format_mask_area_ratio = 0.3
    covered_ratio = np.count_nonzero(background_mask) / background_mask.size

    if split_joint_panels and covered_ratio < stripe_format_mask_area_ratio:
        return joint_panel_split_extraction(grayscale_image, background_mask)

    return cv2.subtract(grayscale_image, background_mask)


def get_fallback_panels(
        image: np.ndarray, 
        grayscale_image: np.ndarray, 
        fallback: bool, 
        panels: list[np.ndarray],
        mode: str = OutputMode.BOUNDING,
) -> list[np.ndarray]:
    """
    Returns the threshold based panels when they beat the given panels

    The threshold extraction is only tried when fallback is enabled and fewer
    than two panels were given; its result is only used when it contains more
    panels than the given list.

    Parameters:
    - image: The page the panels are cut from
    - grayscale_image: Grayscale version of the page
    - fallback: Whether the fallback may be used at all
    - panels: The panels found so far
    - mode: The mode to use for extraction
        - 'masked': Extracts the panels by cutting out only the inside of the contours
        - 'bounding': Extracts the panels by using the bounding boxes of the contours
    """
    if not fallback or len(panels) >= 2:
        return panels

    candidates = threshold_extraction(image, grayscale_image, mode=mode)
    return candidates if len(candidates) > len(panels) else panels


def generate_panel_blocks(
        image: np.ndarray, 
        background_generator: Callable[[np.ndarray], np.ndarray] = generate_background_mask,
        split_joint_panels: bool = False,
        fallback: bool = True,
        mode: str = OutputMode.BOUNDING,
        merge: str = MergeMode.NONE,
        rtl_order: bool = False
) -> list[np.ndarray]:
    """
    Generates the separate panel images from the base image
    
    Parameters:
    - image: The page as a BGR image
    - background_generator: Builds the background mask from the preprocessed page
    - split_joint_panels: Whether joint panels should be split
    - fallback: Whether the threshold extraction may be used when fewer than
      two panels are found on the page
    - mode: The mode to use for extraction
        - 'masked': Extracts the panels by cutting out only the inside of the contours
        - 'bounding': Extracts the panels by using the bounding boxes of the contours
    - merge: The merge mode
        - 'none': Every panel is returned on its own
        - 'horizontal': Panels are grouped and each group is concatenated horizontally
        - 'vertical': Panels are grouped and each group is concatenated vertically
    - rtl_order: If True, sort panels from right-to-left. Otherwise, left-to-right.

    Returns the panel images; an unknown merge value yields an empty list
    """

    grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    processed_image = preprocess_image_with_dilation(grayscale_image)
    background_mask = background_generator(processed_image)
    page_without_background = get_page_without_background(grayscale_image, background_mask, split_joint_panels)
    contours, _ = cv2.findContours(page_without_background, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = [c for c in contours if is_contour_sufficiently_big(c, image.shape[0], image.shape[1])]

    # The fallback is a page level decision: it replaces the result with a
    # whole-page threshold extraction, so it must depend on how many panels the
    # page has, not on how many panels a single merge group has. Otherwise
    # every one-panel group would be swapped for the panels of the whole page.
    use_fallback = fallback and len(contours) < 2

    # The reading order is implemented by sort_panels_by_column_then_row,
    # which receives rtl_order to pick the horizontal direction.
    if contours:
        contours = sort_panels_by_column_then_row(contours, rtl_order)

    def get_panels(contours):
        panels = extract_panels(image, contours, mode=mode)
        panels = get_fallback_panels(image, grayscale_image, use_fallback, panels, mode=mode)
        return panels

    panels = []
    if merge == MergeMode.NONE:
        panels = get_panels(contours)
    elif merge == MergeMode.HORIZONTAL:
        grouped_contours = group_contours_horizontally(contours)
        for group in grouped_contours:
            panels.append(adaptive_hconcat(get_panels(group)))
    elif merge == MergeMode.VERTICAL:
        grouped_contours = group_contours_vertically(contours)
        for group in grouped_contours:
            panels.append(adaptive_vconcat(get_panels(group)))

    return panels


def generate_panel_blocks_by_ai(
        image: np.ndarray,
        merge: str = MergeMode.NONE,
        rtl_order: bool = False
) -> list[np.ndarray]:
    """
    Generates the separate panel images from the base image using AI with merge
    
    Parameters:
    - image: The page as a BGR image
    - merge: The merge mode
        - 'none': Every panel is returned on its own
        - 'horizontal': Panels are grouped and each group is concatenated horizontally
        - 'vertical': Panels are grouped and each group is concatenated vertically
    - rtl_order: If True, sort panels from right-to-left. Otherwise, left-to-right.

    Returns the panel images; an unknown merge value yields an empty list
    """
    grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    processed_image = preprocess_image(grayscale_image)

    # Ignore 'FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.'
    # The filter is scoped to the model call so the caller's warning
    # configuration is restored afterwards, even when the model raises.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        results = model(processed_image)

    bounding_boxes = []
    for detection in results.xyxy[0]:  # Access predictions in (x1, y1, x2, y2, confidence, class) format
        x1, y1, x2, y2, conf, cls = detection.tolist()  # Convert to Python list
        x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])
        # Stored as (x, y, w, h)
        bounding_boxes.append((x1, y1, x2 - x1, y2 - y1))
        
    # The reading order is implemented by sort_panels_by_column_then_row,
    # which receives rtl_order to pick the horizontal direction.
    if bounding_boxes:
        bounding_boxes = sort_panels_by_column_then_row(bounding_boxes, rtl_order)

    def get_panels(bounding_boxes):
        return [image[y:y + h, x:x + w] for x, y, w, h in bounding_boxes]

    panels = []
    if merge == MergeMode.NONE:
        panels = get_panels(bounding_boxes)
    elif merge == MergeMode.HORIZONTAL:
        grouped_bounding_boxes = group_bounding_boxes_horizontally(bounding_boxes)
        for group in grouped_bounding_boxes:
            panels.append(adaptive_hconcat(get_panels(group)))
    elif merge == MergeMode.VERTICAL:
        grouped_bounding_boxes = group_bounding_boxes_vertically(bounding_boxes)
        for group in grouped_bounding_boxes:
            panels.append(adaptive_vconcat(get_panels(group)))

    return panels


def extract_panels_for_image(
        image_path: str, 
        output_dir: str, 
        fallback: bool = True, 
        split_joint_panels: bool = False,
        mode: str = OutputMode.BOUNDING,
        merge: str = MergeMode.NONE
        ) -> None:
    """
    Extracts the panels of one image file and writes them into output_dir

    Nothing is done when image_path does not exist. Every panel is written as
    "<image name>_<panel index><image extension>".
    """
    if not os.path.exists(image_path):
        return

    absolute_path = os.path.abspath(image_path)
    loaded = load_image(os.path.dirname(absolute_path), absolute_path)
    stem, extension = os.path.splitext(loaded.image_name)

    blocks = generate_panel_blocks(
        loaded.image,
        split_joint_panels=split_joint_panels,
        fallback=fallback,
        mode=mode,
        merge=merge,
    )

    for index, block in enumerate(tqdm(blocks, total=len(blocks))):
        target = os.path.join(output_dir, f"{stem}_{index}{extension}")
        cv2.imwrite(target, block)


def extract_panels_for_images_in_folder(
        input_dir: str, 
        output_dir: str, 
        fallback: bool = True, 
        split_joint_panels: bool = False,
        mode: str = OutputMode.BOUNDING,
        merge: str = MergeMode.NONE
        ) -> tuple[int, int]:
    """
    Extracts the panels of every image loaded from input_dir into output_dir

    This is the entry point meant for cli usage. Every panel is written as
    "<image name>_<panel index><image extension>".

    Returns (number of entries listed in input_dir, number of panels written),
    or (0, 0) when output_dir does not exist
    """
    if not os.path.exists(output_dir):
        return (0, 0)

    # Counts every directory entry; also used as the progress bar total
    entry_count = len(os.listdir(input_dir))
    panel_count = 0

    for page in tqdm(load_images(input_dir), total=entry_count):
        stem, extension = os.path.splitext(page.image_name)
        blocks = generate_panel_blocks(
            page.image,
            fallback=fallback,
            split_joint_panels=split_joint_panels,
            mode=mode,
            merge=merge,
        )
        for index, block in enumerate(blocks):
            cv2.imwrite(os.path.join(output_dir, f"{stem}_{index}{extension}"), block)
        panel_count += len(blocks)

    return (entry_count, panel_count)


def extract_panels_for_images_in_folder_by_ai(
        input_dir: str, 
        output_dir: str
        ) -> tuple[int, int]:
    """
    Extracts the panels of every image loaded from input_dir into output_dir using the AI model

    This is the entry point meant for cli usage. Every panel is written as
    "<image name>_<panel index><image extension>".

    Returns (number of entries listed in input_dir, number of panels written),
    or (0, 0) when output_dir does not exist
    """
    if not os.path.exists(output_dir):
        return (0, 0)

    # Counts every directory entry; also used as the progress bar total
    entry_count = len(os.listdir(input_dir))
    panel_count = 0

    for page in tqdm(load_images(input_dir), total=entry_count):
        stem, extension = os.path.splitext(page.image_name)
        blocks = generate_panel_blocks_by_ai(page.image)
        for index, block in enumerate(blocks):
            cv2.imwrite(os.path.join(output_dir, f"{stem}_{index}{extension}"), block)
        panel_count += len(blocks)

    return (entry_count, panel_count)