Spaces:

vespa-engine
/

colpali-vespa-visual-retrieval

Running on L40S

App Files Files Community

colpali-vespa-visual-retrieval / feed_vespa.py

thomasht86

Upload folder using huggingface_hub

be59b6e verified 4 months ago

raw

history blame contribute delete

7.62 kB

	#!/usr/bin/env python3

	import argparse
	import hashlib
	import json
	import os
	import tempfile
	from io import BytesIO
	from typing import cast

	import numpy as np
	import requests
	import torch
	from colpali_engine.models import ColPali, ColPaliProcessor
	from colpali_engine.utils.torch_utils import get_torch_device
	from dotenv import load_dotenv
	from pdf2image import convert_from_path
	from pypdf import PdfReader
	from torch.utils.data import DataLoader
	from tqdm import tqdm
	from vespa.application import Vespa
	from vespa.io import VespaResponse
	from vidore_benchmark.utils.image_utils import scale_image, get_base64_image

	# Load variables from a local .env file (if present) into the environment
	# before main() reads VESPA_APP_URL and VESPA_CLOUD_SECRET_TOKEN.
	load_dotenv()


	# Hugging Face identifier of the ColPali checkpoint used to embed pages.
	_MODEL_NAME = "vidore/colpali-v1.2"
	# Local cache of the generated feed; when present, embedding is skipped.
	_FEED_CACHE_PATH = "vespa_feed.json"
	# Upper bound (seconds) for each PDF download so a stalled server cannot
	# hang the script forever.
	_DOWNLOAD_TIMEOUT_SECONDS = 60
	# PDFs that are downloaded, embedded and fed when no cached feed exists.
	_SAMPLE_PDFS = (
	    {
	        "title": "ConocoPhillips Sustainability Highlights - Nature (24-0976)",
	        "url": "https://static.conocophillips.com/files/resources/24-0976-sustainability-highlights_nature.pdf",
	    },
	    {
	        "title": "ConocoPhillips Managing Climate Related Risks",
	        "url": "https://static.conocophillips.com/files/resources/conocophillips-2023-managing-climate-related-risks.pdf",
	    },
	    {
	        "title": "ConocoPhillips 2023 Sustainability Report",
	        "url": "https://static.conocophillips.com/files/resources/conocophillips-2023-sustainability-report.pdf",
	    },
	)


	def _download_pdf(url):
	    """Download ``url`` and return its body as an in-memory byte stream.

	    Raises:
	        Exception: If the server answers with a status code other than 200.
	        requests.exceptions.RequestException: On connection errors or when
	            the request exceeds ``_DOWNLOAD_TIMEOUT_SECONDS``.
	    """
	    response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
	    if response.status_code != 200:
	        raise Exception(
	            f"Failed to download PDF: Status code {response.status_code}"
	        )
	    return BytesIO(response.content)


	def _get_pdf_images(pdf_url):
	    """Return ``(images, page_texts)`` for the PDF at ``pdf_url``.

	    ``images`` is the list produced by ``convert_from_path`` and
	    ``page_texts`` holds the ``extract_text()`` result of every page, in page
	    order.

	    Raises:
	        ValueError: If the number of rendered images differs from the number
	            of pages that text was extracted from.
	    """
	    pdf_file = _download_pdf(pdf_url)
	    # pdf2image needs a file path. A private temporary directory avoids
	    # clobbering a "temp.pdf" in the working directory and guarantees the
	    # file is removed afterwards, even on error.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        temp_file = os.path.join(temp_dir, "temp.pdf")
	        with open(temp_file, "wb") as f:
	            f.write(pdf_file.read())
	        reader = PdfReader(temp_file)
	        page_texts = [page.extract_text() for page in reader.pages]
	        images = convert_from_path(temp_file)
	    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
	    if len(images) != len(page_texts):
	        raise ValueError(
	            f"Page count mismatch for {pdf_url}: "
	            f"{len(images)} images vs {len(page_texts)} texts"
	        )
	    return (images, page_texts)


	def _load_model_and_processor():
	    """Load the ColPali model (in eval mode) and its processor."""
	    device = get_torch_device("auto")
	    print(f"Using device: {device}")
	    model = cast(
	        ColPali,
	        ColPali.from_pretrained(
	            _MODEL_NAME,
	            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
	            device_map=device,
	        ),
	    ).eval()
	    processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(_MODEL_NAME))
	    return model, processor


	def _embed_pages(model, processor, images):
	    """Run ``model`` over ``images`` and return one CPU tensor per image."""
	    dataloader = DataLoader(
	        images,
	        batch_size=2,
	        shuffle=False,
	        collate_fn=processor.process_images,
	    )
	    page_embeddings = []
	    for batch_doc in tqdm(dataloader):
	        with torch.no_grad():
	            batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
	            embeddings_doc = model(**batch_doc)
	        # Split the batch dimension so each page gets its own tensor.
	        page_embeddings.extend(torch.unbind(embeddings_doc.to("cpu")))
	    return page_embeddings


	def _binarize_embedding(embedding):
	    """Map each row of ``embedding`` to a hex string of its sign bits.

	    Every value > 0 becomes bit 1, everything else bit 0; the bits of a row
	    are packed into bytes, reinterpreted as int8 and hex-encoded. The result
	    is a dict keyed by the row index.
	    """
	    # One vectorised packbits call over all rows replaces a Python loop that
	    # packed each row separately; the per-row output is the same.
	    # assumes ``embedding`` is 2-D (rows = patches) — TODO confirm
	    packed = np.packbits(np.where(embedding > 0, 1, 0), axis=1).astype(np.int8)
	    return {idx: row.tobytes().hex() for idx, row in enumerate(packed)}


	def _build_page(url, title, page_number, page_text, embedding, image):
	    """Assemble the feed entry (document id plus fields) for one PDF page."""
	    # The document id is the md5 hash of the url and the page number.
	    id_hash = hashlib.md5(f"{url}_{page_number}".encode()).hexdigest()
	    return {
	        "id": id_hash,
	        "fields": {
	            "id": id_hash,
	            "url": url,
	            "title": title,
	            "page_number": page_number,
	            # Downscaled copy next to the full-resolution image.
	            "image": get_base64_image(scale_image(image, 640), add_url_prefix=False),
	            "full_image": get_base64_image(image, add_url_prefix=False),
	            "text": page_text,
	            "embedding": _binarize_embedding(embedding),
	        },
	    }


	def _generate_feed():
	    """Download the sample PDFs, embed every page and return the feed list."""
	    # Download and rasterise everything first so that network problems
	    # surface before the (expensive) model is loaded.
	    documents = []
	    for pdf in _SAMPLE_PDFS:
	        page_images, page_texts = _get_pdf_images(pdf["url"])
	        documents.append({**pdf, "images": page_images, "texts": page_texts})

	    model, processor = _load_model_and_processor()
	    for document in documents:
	        document["embeddings"] = _embed_pages(model, processor, document["images"])

	    vespa_feed = []
	    for document in documents:
	        pages = zip(document["texts"], document["embeddings"], document["images"])
	        for page_number, (page_text, embedding, image) in enumerate(pages):
	            vespa_feed.append(
	                _build_page(
	                    document["url"],
	                    document["title"],
	                    page_number,
	                    page_text,
	                    embedding,
	                    image,
	                )
	            )
	    return vespa_feed


	def _load_cached_feed(path):
	    """Read a feed saved by ``_save_feed`` back into ``{"id", "fields"}`` dicts."""
	    with open(path, "r", encoding="utf-8") as f:
	        vespa_feed_saved = json.load(f)
	    vespa_feed = []
	    for doc in vespa_feed_saved:
	        fields = doc["fields"]
	        # Put id format: 'id:application_name:schema_name::document_id'.
	        # If the separator is missing, fall back to the id stored in the
	        # fields rather than feeding a document with an empty id.
	        document_id = doc["put"].partition("::")[2] or fields.get("id", "")
	        vespa_feed.append({"id": document_id, "fields": fields})
	    return vespa_feed


	def _save_feed(vespa_feed, path, application_name, schema_name):
	    """Write ``vespa_feed`` to ``path`` as a JSON list of put operations."""
	    vespa_feed_to_save = [
	        {
	            "put": f"id:{application_name}:{schema_name}::{page['id']}",
	            "fields": page["fields"],
	        }
	        for page in vespa_feed
	    ]
	    with open(path, "w", encoding="utf-8") as f:
	        json.dump(vespa_feed_to_save, f)


	def _feed_callback(response: VespaResponse, id: str):
	    """Report documents that Vespa rejected; successful feeds are silent."""
	    if not response.is_successful():
	        print(
	            f"Failed to feed document {id} with status code {response.status_code}: Reason {response.get_json()}"
	        )


	def main():
	    """Feed the sample PDF pages into a Vespa application.

	    Command-line options ``--application_name`` and ``--vespa_schema_name``
	    are optional and default to ``colpalidemo`` and ``pdf_page``. The Vespa
	    endpoint and token are read from the ``VESPA_APP_URL`` and
	    ``VESPA_CLOUD_SECRET_TOKEN`` environment variables.

	    If ``vespa_feed.json`` exists in the working directory it is fed as-is;
	    otherwise the sample PDFs are downloaded, embedded with ColPali, written
	    to ``vespa_feed.json`` and then fed.

	    Raises:
	        SystemExit: If ``VESPA_APP_URL`` is not set.
	    """
	    parser = argparse.ArgumentParser(description="Feed data into Vespa application")
	    # ``required=True`` made the declared defaults unreachable; the options
	    # are now optional so the defaults actually apply. Passing them
	    # explicitly works exactly as before.
	    parser.add_argument(
	        "--application_name",
	        default="colpalidemo",
	        help="Vespa application name",
	    )
	    parser.add_argument(
	        "--vespa_schema_name",
	        default="pdf_page",
	        help="Vespa schema name",
	    )
	    args = parser.parse_args()
	    application_name = args.application_name
	    schema_name = args.vespa_schema_name

	    vespa_app_url = os.getenv("VESPA_APP_URL")
	    vespa_cloud_secret_token = os.getenv("VESPA_CLOUD_SECRET_TOKEN")
	    if not vespa_app_url:
	        # Fail with a clear message instead of an obscure error further down.
	        raise SystemExit("VESPA_APP_URL environment variable is not set")

	    # Instantiate Vespa connection using token and check it is reachable
	    # before doing any expensive work.
	    app = Vespa(url=vespa_app_url, vespa_cloud_secret_token=vespa_cloud_secret_token)
	    app.get_application_status()

	    if os.path.exists(_FEED_CACHE_PATH):
	        # The model is deliberately not loaded on this path: the cached feed
	        # already contains the embeddings.
	        print("Loading vespa_feed from vespa_feed.json")
	        vespa_feed = _load_cached_feed(_FEED_CACHE_PATH)
	    else:
	        print("Generating vespa_feed")
	        vespa_feed = _generate_feed()
	        _save_feed(vespa_feed, _FEED_CACHE_PATH, application_name, schema_name)

	    # Feed data into Vespa.
	    # NOTE(review): no namespace is passed here, so the namespace Vespa uses
	    # may differ from the application_name written into the cached put ids —
	    # confirm against pyvespa's feed_iterable defaults.
	    app.feed_iterable(vespa_feed, schema=schema_name, callback=_feed_callback)


	# Run the feed pipeline only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()