Spaces:

Realcat
/

image-matching-webui

Running

image-matching-webui / third_party /SOLD2 /sold2 /model /line_matcher.py

Vincentqyw

fix: roma

c74a070 about 1 year ago

12.2 kB

	"""
	Implements the full pipeline from raw images to line matches.
	"""
	import time
	import cv2
	import numpy as np
	import torch
	import torch.nn.functional as F
	from torch.nn.functional import softmax

	from .model_util import get_model
	from .loss import get_loss_and_weights
	from .metrics import super_nms
	from .line_detection import LineSegmentDetectionModule
	from .line_matching import WunschLineMatcher
	from ..train import convert_junc_predictions
	from ..misc.train_utils import adapt_checkpoint
	from .line_detector import line_map_to_segments


	class LineMatcher:
	    """Full line matcher including line detection and matching
	    with the Needleman-Wunsch algorithm.

	    An instance wraps three components: a CNN backbone producing junction,
	    heatmap and descriptor predictions, a ``LineSegmentDetectionModule``
	    turning junctions + heatmap into a line map, and a ``WunschLineMatcher``
	    matching the line segments of two images from their descriptors.
	    """

	    def __init__(
	        self,
	        model_cfg,
	        ckpt_path,
	        device,
	        line_detector_cfg,
	        line_matcher_cfg,
	        multiscale=False,
	        scales=(1.0, 2.0),
	    ):
	        """Build the backbone, the line detector and the line matcher.

	        Args:
	            model_cfg: mapping forwarded to ``get_loss_and_weights`` and
	                ``get_model``; the keys "grid_size" and "detection_thresh"
	                are required, "max_num_junctions" is optional (default 300).
	            ckpt_path: checkpoint handed to ``torch.load``; the loaded object
	                must expose a "model_state_dict" entry.
	            device: device the checkpoint is mapped to and the model moved to.
	            line_detector_cfg: keyword arguments for
	                ``LineSegmentDetectionModule``.
	            line_matcher_cfg: keyword arguments for ``WunschLineMatcher``.
	            multiscale: when true, ``__call__`` uses
	                ``multiscale_line_detection`` instead of ``line_detection``.
	            scales: image scale factors used in multiscale mode.
	        """
	        # Get loss weights if dynamic weighting
	        _, loss_weights = get_loss_and_weights(model_cfg, device)
	        self.device = device

	        # Initialize the cnn backbone
	        self.model = get_model(model_cfg, loss_weights)
	        checkpoint = torch.load(ckpt_path, map_location=self.device)
	        checkpoint = adapt_checkpoint(checkpoint["model_state_dict"])
	        self.model.load_state_dict(checkpoint)
	        self.model = self.model.to(self.device)
	        self.model = self.model.eval()

	        self.grid_size = model_cfg["grid_size"]
	        self.junc_detect_thresh = model_cfg["detection_thresh"]
	        self.max_num_junctions = model_cfg.get("max_num_junctions", 300)

	        # Initialize the line detector
	        self.line_detector = LineSegmentDetectionModule(**line_detector_cfg)
	        self.multiscale = multiscale
	        # Store a private copy so the instance never aliases the caller's
	        # sequence (nor a shared default object).
	        self.scales = list(scales)

	        # Initialize the line matcher
	        self.line_matcher = WunschLineMatcher(**line_matcher_cfg)

	        # Print some debug messages
	        for key, val in line_detector_cfg.items():
	            print(f"[Debug] {key}: {val}")

	    @staticmethod
	    def _check_input(input_image):
	        """Raise ValueError unless input_image is a 4D torch tensor."""
	        # The type is tested first so that objects without a ``shape``
	        # attribute are rejected with the same ValueError.
	        if not isinstance(input_image, torch.Tensor) or input_image.dim() != 4:
	            raise ValueError("[Error] the input image should be a 4D torch tensor")

	    @staticmethod
	    def _heatmap_probability(raw_heatmap):
	        """Convert the raw heatmap output to a single-channel probability."""
	        if raw_heatmap.shape[1] == 2:
	            # Two-channel output: softmax over channels, keep channel 1 only
	            return softmax(raw_heatmap, dim=1)[:, 1:, :, :]
	        return torch.sigmoid(raw_heatmap)

	    @staticmethod
	    def _junction_coords(junc_pred_nms, valid_mask):
	        """Return an (N, 2) array with the indices of the non-zero junctions."""
	        junc_map = junc_pred_nms.squeeze()
	        if valid_mask is not None:
	            junc_map = junc_map * valid_mask
	        coords = np.where(junc_map)
	        return np.stack([coords[0], coords[1]], axis=-1)

	    def _detect_segments(self, junctions, heatmap, outputs):
	        """Run the line detector and store its results in ``outputs``.

	        Fills the "heatmap", "junctions" and "line_segments" entries.
	        """
	        line_map, junctions, heatmap = self.line_detector.detect(
	            junctions, heatmap, device=self.device
	        )
	        # The detector may hand back torch tensors; the outputs and
	        # line_map_to_segments are fed numpy arrays.
	        if isinstance(line_map, torch.Tensor):
	            line_map = line_map.cpu().numpy()
	        if isinstance(junctions, torch.Tensor):
	            junctions = junctions.cpu().numpy()
	        if isinstance(heatmap, torch.Tensor):
	            heatmap = heatmap.cpu().numpy()
	        outputs["heatmap"] = heatmap
	        outputs["junctions"] = junctions

	        if len(line_map.shape) > 2:
	            # Line map with multiple detect_thresh and inlier_thresh:
	            # build a nested list indexed as [detect_idx][inlier_idx].
	            num_detect_thresh, num_inlier_thresh = line_map.shape[:2]
	            line_segments = [
	                [
	                    line_map_to_segments(
	                        junctions, line_map[detect_idx, inlier_idx, :, :]
	                    )
	                    for inlier_idx in range(num_inlier_thresh)
	                ]
	                for detect_idx in range(num_detect_thresh)
	            ]
	        else:
	            line_segments = line_map_to_segments(junctions, line_map)

	        outputs["line_segments"] = line_segments

	    def line_detection(
	        self, input_image, valid_mask=None, desc_only=False, profile=False
	    ):
	        """Perform line detection and descriptor inference on a single image.

	        Args:
	            input_image: 4D torch tensor fed to the backbone.
	            valid_mask: optional mask multiplied with the junction map
	                before extracting junction coordinates.
	            desc_only: when true, only the descriptors are returned.
	            profile: when true, the elapsed time is added under "time".

	        Returns:
	            dict with "descriptor" and, unless ``desc_only`` is set,
	            "heatmap", "junctions" and "line_segments"; plus "time" (seconds)
	            when ``profile`` is set.

	        Raises:
	            ValueError: if ``input_image`` is not a 4D torch tensor.
	        """
	        self._check_input(input_image)

	        # Move the input to corresponding device
	        input_image = input_image.to(self.device)

	        # Forward of the CNN backbone
	        start_time = time.perf_counter()
	        with torch.no_grad():
	            net_outputs = self.model(input_image)

	        outputs = {"descriptor": net_outputs["descriptors"]}

	        if not desc_only:
	            junc_np = convert_junc_predictions(
	                net_outputs["junctions"],
	                self.grid_size,
	                self.junc_detect_thresh,
	                self.max_num_junctions,
	            )
	            junctions = self._junction_coords(junc_np["junc_pred_nms"], valid_mask)

	            # Only the first batch element and its single channel are used
	            heatmap = self._heatmap_probability(net_outputs["heatmap"])
	            heatmap = heatmap.cpu().numpy()[0, 0, :, :]

	            self._detect_segments(junctions, heatmap, outputs)

	        if profile:
	            outputs["time"] = time.perf_counter() - start_time

	        return outputs

	    def multiscale_line_detection(
	        self,
	        input_image,
	        valid_mask=None,
	        desc_only=False,
	        profile=False,
	        scales=(1.0, 2.0),
	        aggregation="mean",
	    ):
	        """Perform line detection and descriptor inference at multiple scales.

	        The image is resized by every factor in ``scales``, passed through
	        the backbone, and the per-scale predictions are resized back and
	        aggregated before junction extraction and line detection.

	        Args:
	            input_image: 4D torch tensor fed to the backbone.
	            valid_mask: optional mask multiplied with the junction map
	                before extracting junction coordinates.
	            desc_only: when true, only the descriptors are returned.
	            profile: when true, the elapsed time is added under "time".
	            scales: non-empty sequence of image scale factors.
	            aggregation: "mean" averages the per-scale predictions; any
	                other value takes their element-wise maximum.

	        Returns:
	            dict with the same entries as ``line_detection``.

	        Raises:
	            ValueError: if ``input_image`` is not a 4D torch tensor or
	                ``scales`` is empty.
	        """
	        self._check_input(input_image)
	        if len(scales) == 0:
	            raise ValueError("[Error] at least one scale is required")

	        # Move the input to corresponding device
	        input_image = input_image.to(self.device)
	        img_size = input_image.shape[2:4]
	        # Descriptors are aggregated at 1/4 of the input resolution
	        # (presumably the descriptor stride of the backbone - TODO confirm).
	        desc_size = (img_size[0] // 4, img_size[1] // 4)

	        # Run the inference at multiple image scales
	        start_time = time.perf_counter()
	        junctions, heatmaps, descriptors = [], [], []
	        for s in scales:
	            # Resize the image
	            resized_img = F.interpolate(input_image, scale_factor=s, mode="bilinear")

	            # Forward of the CNN backbone
	            with torch.no_grad():
	                net_outputs = self.model(resized_img)

	            descriptors.append(
	                F.interpolate(
	                    net_outputs["descriptors"], size=desc_size, mode="bilinear"
	                )
	            )

	            if not desc_only:
	                junc_prob = convert_junc_predictions(
	                    net_outputs["junctions"], self.grid_size
	                )["junc_pred"]
	                # cv2.resize takes the target size as (width, height)
	                junctions.append(
	                    cv2.resize(
	                        junc_prob.squeeze(),
	                        (img_size[1], img_size[0]),
	                        interpolation=cv2.INTER_LINEAR,
	                    )
	                )

	                heatmap = self._heatmap_probability(net_outputs["heatmap"])
	                heatmaps.append(
	                    F.interpolate(heatmap, size=img_size, mode="bilinear")
	                )

	        # Aggregate the results through the mean or the max activation
	        use_mean = aggregation == "mean"
	        stacked_desc = torch.stack(descriptors, dim=0)
	        descriptors = stacked_desc.mean(0) if use_mean else stacked_desc.max(0)[0]
	        outputs = {"descriptor": descriptors}

	        if not desc_only:
	            stacked_junc = np.stack(junctions, axis=0)
	            stacked_heat = torch.stack(heatmaps, dim=0)
	            if use_mean:
	                junc_prob = stacked_junc.mean(0)[None]
	                heatmap = stacked_heat.mean(0)
	            else:
	                junc_prob = stacked_junc.max(0)[None]
	                heatmap = stacked_heat.max(0)[0]
	            # Only the first batch element and its single channel are used
	            heatmap = heatmap[0, 0, :, :].cpu().numpy()

	            # Extract junctions
	            junc_pred_nms = super_nms(
	                junc_prob[..., None],
	                self.grid_size,
	                self.junc_detect_thresh,
	                self.max_num_junctions,
	            )
	            junctions = self._junction_coords(junc_pred_nms, valid_mask)

	            self._detect_segments(junctions, heatmap, outputs)

	        if profile:
	            outputs["time"] = time.perf_counter() - start_time

	        return outputs

	    def __call__(self, images, valid_masks=(None, None), profile=False):
	        """Detect lines in two images and match them.

	        Args:
	            images: pair of 4D torch tensors (indexed with 0 and 1).
	            valid_masks: pair of optional masks, one per image.
	            profile: when true, detection and matching times are reported.

	        Returns:
	            dict with "line_segments" (the segments of both images) and
	            "matches" (output of the line matcher); plus
	            "line_detection_time" and "line_matching_time" (seconds) when
	            ``profile`` is set.
	        """
	        # Line detection and descriptor inference on both images
	        if self.multiscale:
	            forward_outputs = [
	                self.multiscale_line_detection(
	                    images[idx], valid_masks[idx], profile=profile, scales=self.scales
	                )
	                for idx in (0, 1)
	            ]
	        else:
	            forward_outputs = [
	                self.line_detection(images[idx], valid_masks[idx], profile=profile)
	                for idx in (0, 1)
	            ]
	        line_seg1 = forward_outputs[0]["line_segments"]
	        line_seg2 = forward_outputs[1]["line_segments"]
	        desc1 = forward_outputs[0]["descriptor"]
	        desc2 = forward_outputs[1]["descriptor"]

	        # Match the lines in both images
	        start_time = time.perf_counter()
	        matches = self.line_matcher.forward(line_seg1, line_seg2, desc1, desc2)
	        end_time = time.perf_counter()

	        outputs = {"line_segments": [line_seg1, line_seg2], "matches": matches}

	        if profile:
	            outputs["line_detection_time"] = (
	                forward_outputs[0]["time"] + forward_outputs[1]["time"]
	            )
	            outputs["line_matching_time"] = end_time - start_time

	        return outputs