|
|
|
|
|
import faiss |
|
|
import numpy as np |
|
|
from huggingface_hub import hf_hub_download |
|
|
from config import HF_DS_REPO_ID, HF_INDEX_FILE, HF_IDS_FILE |
|
|
from modules.retriever import set_index |
|
|
from modules.corpus import prepare_corpus, _get_datasets, set_id_to_row |
|
|
|
|
|
# Array mapping FAISS vector positions to IDs, loaded from the HF Hub by
# `_load_index_in_memory`; stays None until that loader runs.
_vector_ids = None
|
|
|
|
|
def _load_index_in_memory():
    """Download the FAISS index and ID mapping from the HF Hub and load both into memory.

    Side effects: hands the deserialized index to the retriever via
    ``set_index`` and rebinds the module-level ``_vector_ids`` array.
    """
    global _vector_ids

    # Both artifacts live in the same dataset repository configured in `config`.
    index_path = hf_hub_download(
        repo_id=HF_DS_REPO_ID, filename=HF_INDEX_FILE, repo_type="dataset"
    )
    ids_path = hf_hub_download(
        repo_id=HF_DS_REPO_ID, filename=HF_IDS_FILE, repo_type="dataset"
    )

    # Deserialize the index and hand it to the retriever module.
    set_index(faiss.read_index(index_path))

    # NOTE(review): allow_pickle=True implies the .npy file may hold
    # object-dtype entries — confirm the file's provenance is trusted.
    _vector_ids = np.load(ids_path, allow_pickle=True)
|
|
|
|
|
def get_vector_ids():
    """Return the vector-ID array loaded by `_load_index_in_memory`.

    Returns:
        The numpy array loaded from the HF Hub ids file, or None if
        `_load_index_in_memory` has not run yet.
    """
    # Reading a module-level name needs no `global` declaration; the
    # original's `global _vector_ids` was a no-op and is removed.
    return _vector_ids
|
|
|
|
|
def initialize_dbs():
    """Initialize all in-memory stores: corpus, FAISS index, and row lookup.

    Steps:
        1. Prepare the corpus datasets.
        2. Download and load the FAISS index + ID mapping from the HF Hub.
        3. Build a page_id -> row mapping across every subset and hand it
           to the corpus module via ``set_id_to_row``.
    """
    prepare_corpus()
    _load_index_in_memory()

    # Flatten every subset into one lookup. A dict comprehension over
    # .values() replaces the manual loop that iterated .items() while
    # discarding the subset key; on duplicate page_ids, later subsets
    # overwrite earlier ones — same as the original insertion order.
    datasets = _get_datasets()
    id_to_row = {r["page_id"]: r for ds in datasets.values() for r in ds}
    set_id_to_row(id_to_row)
|
|
|
|
|
def force_update():
    """Force a refresh: re-download the index/ID mapping from the HF Hub and replace the in-memory copies."""
    _load_index_in_memory()
|
|
|
|
|
|