|
|
|
|
|
import faiss |
|
|
import numpy as np |
|
|
from huggingface_hub import hf_hub_download |
|
|
from config import HF_DS_REPO_ID, HF_INDEX_FILE, HF_IDS_FILE |
|
|
from modules.retriever import set_index |
|
|
from modules.corpus import prepare_corpus, _get_datasets, set_id_to_row |
|
|
|
|
|
# Array mapping FAISS vector positions to IDs, loaded from the HF Hub by
# `_load_index_in_memory`; stays None until that loader runs.
_vector_ids = None
|
|
|
|
|
def _load_index_in_memory():
    """Download the FAISS index and ID mapping from the HF Hub and load both into memory.

    Side effects: hands the deserialized index to the retriever via
    ``set_index`` and rebinds the module-level ``_vector_ids`` array.
    """
    global _vector_ids

    # Both artifacts live in the same dataset repository configured in `config`.
    index_path = hf_hub_download(
        repo_id=HF_DS_REPO_ID, filename=HF_INDEX_FILE, repo_type="dataset"
    )
    ids_path = hf_hub_download(
        repo_id=HF_DS_REPO_ID, filename=HF_IDS_FILE, repo_type="dataset"
    )

    # Deserialize the index and hand it to the retriever module.
    set_index(faiss.read_index(index_path))

    # NOTE(review): allow_pickle=True implies the .npy file may hold
    # object-dtype entries — confirm the file's provenance is trusted.
    _vector_ids = np.load(ids_path, allow_pickle=True)
|
|
|
|
|
def get_vector_ids():
    """Return the vector-ID array loaded by `_load_index_in_memory`.

    Returns:
        The numpy array loaded from the HF Hub ids file, or None if
        `_load_index_in_memory` has not run yet.
    """
    # Reading a module-level name needs no `global` declaration; the
    # original's `global _vector_ids` was a no-op and is removed.
    return _vector_ids
|
|
|
|
|
def initialize_dbs():
    """Initialize all in-memory stores: corpus, FAISS index, and row lookup.

    Steps:
        1. Prepare the corpus datasets.
        2. Download and load the FAISS index + ID mapping from the HF Hub.
        3. Build a page_id -> row mapping across every subset and hand it
           to the corpus module via ``set_id_to_row``.
    """
    prepare_corpus()
    _load_index_in_memory()

    # Flatten every subset into one lookup. A dict comprehension over
    # .values() replaces the manual loop that iterated .items() while
    # discarding the subset key; on duplicate page_ids, later subsets
    # overwrite earlier ones — same as the original insertion order.
    datasets = _get_datasets()
    id_to_row = {r["page_id"]: r for ds in datasets.values() for r in ds}
    set_id_to_row(id_to_row)
|
|
|
|
|
def force_update():
    """Force a refresh: re-download the index/ID mapping from the HF Hub and replace the in-memory copies."""
    _load_index_in_memory()
|
|
|
|
|
|