Spaces:

hyper3labs
/

HyperView

Running

App Files Files Community

HyperView/src/hyperview/embeddings/projection.py

morozovdd

feat: add HyperView app for space

23680f2 3 days ago

raw

history blame contribute delete

9.13 kB

	"""Projection methods for dimensionality reduction."""

	import logging
	import warnings

	import numpy as np
	import umap

	logger = logging.getLogger(__name__)


	class ProjectionEngine:
	"""Engine for projecting high-dimensional embeddings to 2D."""

	def to_poincare_ball(
	self,
	hyperboloid_embeddings: np.ndarray,
	curvature: float \| None = None,
	clamp_radius: float = 0.999999,
	) -> np.ndarray:
	"""Convert hyperboloid (Lorentz) coordinates to Poincaré ball coordinates.

	Input is expected to be shape (N, D+1) with first coordinate being time-like.
	Points are assumed to satisfy: t^2 - \|\|x\|\|^2 = 1/c (c > 0).

	Returns Poincaré ball coordinates of shape (N, D) in the unit ball.

	Notes:
	- Many hyperbolic libraries parameterize curvature as a positive number c
	where the manifold has sectional curvature -c.
	- We map to the unit ball for downstream distance metrics (UMAP 'poincare').
	"""
	if hyperboloid_embeddings.ndim != 2 or hyperboloid_embeddings.shape[1] < 2:
	raise ValueError(
	"hyperboloid_embeddings must have shape (N, D+1) with D>=1"
	)

	c = float(curvature) if curvature is not None else 1.0
	if c <= 0:
	raise ValueError(f"curvature must be > 0, got {c}")

	# Radius R = 1/sqrt(c) for curvature -c
	R = 1.0 / np.sqrt(c)

	t = hyperboloid_embeddings[:, :1]
	x = hyperboloid_embeddings[:, 1:]

	# Map to ball radius R: u_R = x / (t + R)
	denom = t + R
	u_R = x / denom

	# Rescale to unit ball: u = u_R / R = sqrt(c) * u_R
	u = u_R / R

	# Numerical guard: ensure inside the unit ball
	radii = np.linalg.norm(u, axis=1)
	mask = radii >= clamp_radius
	if np.any(mask):
	u[mask] = u[mask] / radii[mask][:, np.newaxis] * clamp_radius

	return u.astype(np.float32)

	def project(
	self,
	embeddings: np.ndarray,
	*,
	input_geometry: str = "euclidean",
	output_geometry: str = "euclidean",
	curvature: float \| None = None,
	method: str = "umap",
	n_neighbors: int = 15,
	min_dist: float = 0.1,
	metric: str = "cosine",
	random_state: int = 42,
	) -> np.ndarray:
	"""Project embeddings to 2D with geometry-aware preprocessing.

	This separates two concerns:
	1) Geometry/model transforms for the input embeddings (e.g. hyperboloid -> Poincaré)
	2) Dimensionality reduction / layout (currently UMAP)

	Args:
	embeddings: Input embeddings (N x D) or hyperboloid (N x D+1).
	input_geometry: Geometry/model of the input embeddings (euclidean, hyperboloid).
	output_geometry: Geometry of the output coordinates (euclidean, poincare).
	curvature: Curvature parameter for hyperbolic embeddings (positive c).
	method: Layout method (currently only 'umap').
	n_neighbors: UMAP neighbors.
	min_dist: UMAP min_dist.
	metric: Input metric (used for euclidean inputs).
	random_state: Random seed.

	Returns:
	2D coordinates (N x 2).
	"""
	if method != "umap":
	raise ValueError(f"Invalid method: {method}. Only 'umap' is supported.")

	prepared = embeddings
	prepared_metric: str = metric

	if input_geometry == "hyperboloid":
	# Convert to unit Poincaré ball and use UMAP's built-in hyperbolic distance.
	prepared = self.to_poincare_ball(embeddings, curvature=curvature)
	prepared_metric = "poincare"

	if output_geometry == "poincare":
	return self.project_to_poincare(
	prepared,
	n_neighbors=n_neighbors,
	min_dist=min_dist,
	metric=prepared_metric,
	random_state=random_state,
	)

	if output_geometry == "euclidean":
	return self.project_umap(
	prepared,
	n_neighbors=n_neighbors,
	min_dist=min_dist,
	metric=prepared_metric,
	n_components=2,
	random_state=random_state,
	)

	raise ValueError(
	f"Invalid output_geometry: {output_geometry}. Must be 'euclidean' or 'poincare'."
	)

	def project_umap(
	self,
	embeddings: np.ndarray,
	n_neighbors: int = 15,
	min_dist: float = 0.1,
	metric: str = "cosine",
	n_components: int = 2,
	random_state: int = 42,
	) -> np.ndarray:
	"""Project embeddings to Euclidean 2D using UMAP."""
	n_neighbors = min(n_neighbors, len(embeddings) - 1)
	if n_neighbors < 2:
	n_neighbors = 2

	n_jobs = 1 if random_state is not None else -1

	reducer = umap.UMAP(
	n_neighbors=n_neighbors,
	min_dist=min_dist,
	n_components=n_components,
	metric=metric,
	random_state=random_state,
	n_jobs=n_jobs,
	)

	coords = reducer.fit_transform(embeddings)
	coords = self._normalize_coords(coords)

	return coords

	def project_to_poincare(
	self,
	embeddings: np.ndarray,
	n_neighbors: int = 15,
	min_dist: float = 0.1,
	metric: str = "cosine",
	random_state: int = 42,
	) -> np.ndarray:
	"""Project embeddings to the Poincaré disk using UMAP with hyperboloid output."""
	n_neighbors = min(n_neighbors, len(embeddings) - 1)
	if n_neighbors < 2:
	n_neighbors = 2

	n_jobs = 1 if random_state is not None else -1

	# Suppress warning about missing gradient for poincare metric (only affects inverse_transform)
	with warnings.catch_warnings():
	warnings.filterwarnings("ignore", message="gradient function is not yet implemented")
	reducer = umap.UMAP(
	n_neighbors=n_neighbors,
	min_dist=min_dist,
	n_components=2,
	metric=metric,
	output_metric="hyperboloid",
	random_state=random_state,
	n_jobs=n_jobs,
	)
	spatial_coords = reducer.fit_transform(embeddings)

	squared_norm = np.sum(spatial_coords**2, axis=1)
	t = np.sqrt(1 + squared_norm)

	# Project to Poincaré disk: u = x / (1 + t)
	denom = 1 + t
	poincare_coords = spatial_coords / denom[:, np.newaxis]

	# Clamp to unit disk for numerical stability
	radii = np.linalg.norm(poincare_coords, axis=1)
	max_radius = 0.999
	mask = radii > max_radius
	if np.any(mask):
	logger.warning(f"Clamping {np.sum(mask)} points to unit disk.")
	poincare_coords[mask] = (
	poincare_coords[mask] / radii[mask][:, np.newaxis] * max_radius
	)

	poincare_coords = self._center_poincare(poincare_coords)
	poincare_coords = self._scale_poincare(poincare_coords, factor=0.65)

	return poincare_coords

	def _scale_poincare(self, coords: np.ndarray, factor: float) -> np.ndarray:
	"""Scale points towards the origin in hyperbolic space.

	Scales hyperbolic distance from origin by `factor`. If factor < 1, points move closer to center.
	"""
	radii = np.linalg.norm(coords, axis=1)
	mask = radii > 1e-6

	r = radii[mask]
	r = np.minimum(r, 0.9999999)
	r_new = np.tanh(factor * np.arctanh(r))

	scale_ratios = np.ones_like(radii)
	scale_ratios[mask] = r_new / r

	return coords * scale_ratios[:, np.newaxis]

	def _center_poincare(self, coords: np.ndarray) -> np.ndarray:
	"""Center points in the Poincaré disk using a Möbius transformation."""
	if len(coords) == 0:
	return coords

	z = coords[:, 0] + 1j * coords[:, 1]
	centroid = np.mean(z)

	if np.abs(centroid) > 0.99 or np.abs(centroid) < 1e-6:
	return coords

	# Möbius transformation: w = (z - a) / (1 - conj(a) * z)
	a = centroid
	w = (z - a) / (1 - np.conj(a) * z)

	return np.stack([w.real, w.imag], axis=1)

	def _normalize_coords(self, coords: np.ndarray) -> np.ndarray:
	"""Normalize coordinates to [-1, 1] range."""
	if len(coords) == 0:
	return coords

	coords = coords - coords.mean(axis=0)
	max_abs = np.abs(coords).max()
	if max_abs > 0:
	coords = coords / max_abs * 0.95

	return coords

	def poincare_distance(self, u: np.ndarray, v: np.ndarray) -> float:
	"""Compute the Poincaré distance between two points."""
	u_norm_sq = np.sum(u**2)
	v_norm_sq = np.sum(v**2)
	diff_norm_sq = np.sum((u - v) ** 2)

	u_norm_sq = min(u_norm_sq, 0.99999)
	v_norm_sq = min(v_norm_sq, 0.99999)

	delta = 2 * diff_norm_sq / ((1 - u_norm_sq) * (1 - v_norm_sq))
	return np.arccosh(1 + delta)