import tarfile
from collections import defaultdict
from pathlib import Path

import faiss
import numpy as np
import pyarrow as pa
import requests
from tqdm import tqdm

__all__ = ["RetrievalDatabase", "download_retrieval_databases"]

RETRIEVAL_DATABASES_URLS = {
    "cc12m": {
        "url": "https://storage-cased.alessandroconti.me/cc12m.tar.gz",
        "cache_subdir": "./cc12m/vit-l-14/",
    },
}


def download_retrieval_databases(cache_dir: str):
"""Download data if needed.
Args:
cache_dir (str): Path to cache directory. Defaults to "~/.cache/cased".
"""
databases_path = Path(cache_dir, "databases")
for name, items in RETRIEVAL_DATABASES_URLS.items():
url = items["url"]
database_path = Path(databases_path, name)
if database_path.exists():
continue
# download data
target_path = Path(databases_path, name + ".tar.gz")
target_path.parent.mkdir(parents=True, exist_ok=True)
with requests.get(url, stream=True) as r:
r.raise_for_status()
total_bytes_size = int(r.headers.get("content-length", 0))
chunk_size = 8192
            p_bar = tqdm(
                desc=f"Downloading {name} index",
                total=total_bytes_size,
                unit="iB",
                unit_scale=True,
            )
with open(target_path, "wb") as f:
for chunk in r.iter_content(chunk_size=chunk_size):
f.write(chunk)
p_bar.update(len(chunk))
p_bar.close()
        # extract the archive next to the download location, then delete it
        with tarfile.open(target_path, "r:gz") as tar:
            tar.extractall(target_path.parent)
        target_path.unlink()
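
# Hypothetical usage sketch (the path below is an example, not a default
# enforced by this module; note that pathlib does not expand "~", so callers
# should pass an already-expanded path):
#
#     download_retrieval_databases(os.path.expanduser("~/.cache/cased"))
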
class RetrievalDatabaseMetadataProvider:
    """Metadata provider for the retrieval database.

    Args:
        metadata_dir (str): Path to the metadata directory.
    """
    def __init__(self, metadata_dir: str):
        # memory-map every Arrow file under the metadata dir and expose them as one table
        metadata_fps = [str(fp) for fp in sorted(Path(metadata_dir).glob("**/*")) if fp.is_file()]
        self.table = pa.concat_tables(
            [
                pa.ipc.RecordBatchFileReader(pa.memory_map(fp, "r")).read_all()
                for fp in metadata_fps
            ]
        )
    def get(self, ids):
        """Get the metadata rows for the given ids.

        Args:
            ids (list): List of row ids.
        """
        columns = self.table.schema.names
        # each id selects the single-row slice [id, id + 1) of the table
        end_ids = [i + 1 for i in ids]
        t = pa.concat_tables([self.table[start:end] for start, end in zip(ids, end_ids)])
        return t.select(columns).to_pandas().to_dict("records")
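
# Illustrative shape of `get`'s return value (column names depend on the
# database; "url" and "caption" here are assumptions, not guarantees):
#
#     provider.get([3, 14]) -> [{"url": ..., "caption": ...}, {"url": ..., "caption": ...}]
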
class RetrievalDatabase:
    """Retrieval database backed by faiss indices and Arrow metadata.

    Args:
        database_name (str): Name of the database.
        cache_dir (str): Path to the cache directory.
    """
def __init__(self, database_name: str, cache_dir: str):
assert database_name in RETRIEVAL_DATABASES_URLS.keys(), (
f"Database name should be one of "
f"{list(RETRIEVAL_DATABASES_URLS.keys())}, got {database_name}."
)
database_dir = Path(cache_dir) / "databases"
database_dir = database_dir / RETRIEVAL_DATABASES_URLS[database_name]["cache_subdir"]
self._database_dir = database_dir
image_index_fp = Path(database_dir) / "image.index"
text_index_fp = Path(database_dir) / "text.index"
image_index = (
faiss.read_index(str(image_index_fp), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
if image_index_fp.exists()
else None
)
text_index = (
faiss.read_index(str(text_index_fp), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
if text_index_fp.exists()
else None
)
metadata_dir = str(Path(database_dir) / "metadata")
metadata_provider = RetrievalDatabaseMetadataProvider(metadata_dir)
self._image_index = image_index
self._text_index = text_index
self._metadata_provider = metadata_provider
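
    # Design note: the indices are memory-mapped rather than loaded into RAM,
    # so constructing the database is cheap even for multi-GB indices; queries
    # page in only the vectors they actually touch.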
    def _map_to_metadata(self, indices: list, distances: list, embs: list, num_images: int):
        """Map retrieved indices to their metadata entries.

        Args:
            indices (list): List of retrieved indices.
            distances (list): List of distances.
            embs (list): List of result embeddings.
            num_images (int): Number of entries to fetch metadata for.
        """
results = []
metas = self._metadata_provider.get(indices[:num_images])
for key, (d, i, emb) in enumerate(zip(distances, indices, embs)):
output = {}
meta = None if key + 1 > len(metas) else metas[key]
if meta is not None:
output.update(self._meta_to_dict(meta))
output["id"] = i.item()
output["similarity"] = d.item()
output["sample_z"] = emb.tolist()
results.append(output)
return results
    def _meta_to_dict(self, metadata):
        """Convert a metadata row to a plain dict.

        Args:
            metadata (dict): Metadata row.
        """
        output = {}
        for k, v in metadata.items():
            if isinstance(v, bytes):
                v = v.decode()
            elif type(v).__module__ == np.__name__:
                # convert numpy scalar types to native Python types
                v = v.item()
            output[k] = v
        return output
    def _get_connected_components(self, neighbors):
        """Find the connected components of a graph.

        Args:
            neighbors (dict): Mapping from node to its list of neighbors.
        """
seen = set()
def component(node):
r = []
nodes = {node}
while nodes:
node = nodes.pop()
seen.add(node)
nodes |= set(neighbors[node]) - seen
r.append(node)
return r
u = []
for node in neighbors:
if node not in seen:
u.append(component(node))
return u
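
    # Example: neighbors = {0: [0, 1], 1: [1, 0], 2: [2]} yields the components
    # [[0, 1], [2]] (order within each component depends on set iteration).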
    def _deduplicate_embeddings(self, embeddings, threshold=0.94):
        """Find near-duplicate embeddings by inner-product similarity.

        Args:
            embeddings (np.matrix): Embeddings to deduplicate.
            threshold (float): Similarity threshold for deduplication. Default is 0.94.
        """
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        # range_search returns (lims, distances, indices): the matches of query i
        # with inner product >= threshold are indices[lims[i]:lims[i + 1]]
        lims, _, indices = index.range_search(embeddings, threshold)
        same_mapping = defaultdict(list)
        for i in range(embeddings.shape[0]):
            start = lims[i]
            end = lims[i + 1]
            for j in indices[start:end]:
                same_mapping[int(i)].append(int(j))
        groups = self._get_connected_components(same_mapping)
        # keep the first element of each duplicate group, mark the rest for removal
        non_uniques = set()
        for g in groups:
            for e in g[1:]:
                non_uniques.add(e)
        return non_uniques
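
    # Because `query` L2-normalizes the reconstructed embeddings before calling
    # this, the inner product above is cosine similarity, and threshold=0.94
    # collapses near-duplicate results.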
    def query(
        self, query: np.matrix, modality: str = "text", num_samples: int = 10
    ) -> list[list[dict]]:
        """Query the database.

        Args:
            query (np.matrix): Query embeddings to search with.
            modality (str): Modality to search. One of `image` or `text`. Defaults to `text`.
            num_samples (int): Number of samples to return. Defaults to 10.
        """
        index = self._image_index if modality == "image" else self._text_index
        assert index is not None, f"No {modality} index found in {self._database_dir}."
        distances, indices, embeddings = index.search_and_reconstruct(query, num_samples)

        results = [indices[i] for i in range(len(indices))]
        # faiss pads missing results with -1; the first -1 marks the end of valid hits
        nb_results = [np.where(r == -1)[0] for r in results]

        total_distances = []
        total_indices = []
        total_embeddings = []
        for i in range(len(results)):
            num_res = nb_results[i][0] if len(nb_results[i]) > 0 else len(results[i])
            result_indices = results[i][:num_res]
            result_distances = distances[i][:num_res]
            result_embeddings = embeddings[i][:num_res]

            # L2-normalize the reconstructed embeddings (guarding against zero vectors)
            l2 = np.atleast_1d(np.linalg.norm(result_embeddings, 2, -1))
            l2[l2 == 0] = 1
            result_embeddings = result_embeddings / np.expand_dims(l2, -1)
            # deduplicate embeddings, translating local positions to database indices
            local_indices_to_remove = self._deduplicate_embeddings(result_embeddings)
            indices_to_remove = set()
            for local_index in local_indices_to_remove:
                indices_to_remove.add(result_indices[local_index])

            curr_indices = []
            curr_distances = []
            curr_embeddings = []
            for ind, dis, emb in zip(result_indices, result_distances, result_embeddings):
                if ind not in indices_to_remove:
                    # mark the index as seen so repeated hits are skipped as well
                    indices_to_remove.add(ind)
                    curr_indices.append(ind)
                    curr_distances.append(dis)
                    curr_embeddings.append(emb)
total_indices.append(curr_indices)
total_distances.append(curr_distances)
total_embeddings.append(curr_embeddings)
if len(total_distances) == 0:
return []
total_results = []
for i in range(len(total_distances)):
results = self._map_to_metadata(
total_indices[i], total_distances[i], total_embeddings[i], num_samples
)
total_results.append(results)
return total_results
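

if __name__ == "__main__":
    # Minimal smoke-test sketch. Both the cache path and the embedding size
    # (768, matching CLIP ViT-L/14) are assumptions for illustration, not
    # values mandated by this module.
    import os

    cache_dir = os.path.expanduser("~/.cache/cased")
    download_retrieval_databases(cache_dir)
    db = RetrievalDatabase("cc12m", cache_dir)

    # query with a random L2-normalized vector standing in for a real CLIP embedding
    rng = np.random.default_rng(0)
    fake_query = rng.standard_normal((1, 768), dtype=np.float32)
    fake_query /= np.linalg.norm(fake_query, axis=-1, keepdims=True)

    results = db.query(fake_query, modality="text", num_samples=10)
    print(f"retrieved {len(results[0])} samples; keys: {list(results[0][0].keys())}")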