m97j committed on
Commit
33b550a
ยท
1 Parent(s): 5f612cd

Initial code commit

Browse files
Files changed (3) hide show
  1. db/initializer.py +20 -9
  2. models/reranker.py +2 -2
  3. modules/corpus.py +21 -24
db/initializer.py CHANGED
@@ -4,14 +4,22 @@ import numpy as np
4
  from huggingface_hub import hf_hub_download
5
  from config import HF_DS_REPO_ID, HF_INDEX_FILE, HF_IDS_FILE
6
  from modules.retriever import set_index
7
- from modules import corpus
8
 
9
  _vector_ids = None
10
 
11
  def _load_index_in_memory():
12
  """HF Hub์—์„œ ์ธ๋ฑ์Šค/ID ๋งคํ•‘์„ ๋ฐ›์•„ ๋ฉ”๋ชจ๋ฆฌ์— ๋กœ๋“œ"""
13
- index_path = hf_hub_download(repo_id=HF_DS_REPO_ID, filename=HF_INDEX_FILE, repo_type="dataset")
14
- ids_path = hf_hub_download(repo_id=HF_DS_REPO_ID, filename=HF_IDS_FILE, repo_type="dataset")
 
 
 
 
 
 
 
 
15
  index = faiss.read_index(index_path)
16
  set_index(index)
17
  global _vector_ids
@@ -22,15 +30,18 @@ def get_vector_ids():
22
  return _vector_ids
23
 
24
  def initialize_dbs():
25
- # 1) ์ฝ”ํผ์Šค ์ค€๋น„ (์ตœ์ดˆ 1ํšŒ๋งŒ ๋‹ค์šด๋กœ๋“œ)
26
- corpus.prepare_corpus()
27
  # 2) ์ธ๋ฑ์Šค/ID ๋งคํ•‘ ๋ฉ”๋ชจ๋ฆฌ ๋กœ๋“œ
28
  _load_index_in_memory()
29
- # 3) Arrow ์บ์‹œ ์ƒ์„ฑ
30
- datasets = corpus._get_datasets()
 
31
  for _subset, ds in datasets.items():
32
- # dummy ํ˜ธ์ถœ๋กœ ์บ์‹œ ์ƒ์„ฑ
33
- _ = ds.filter(lambda r: False)
 
34
 
35
  def force_update():
36
  _load_index_in_memory()
 
 
4
  from huggingface_hub import hf_hub_download
5
  from config import HF_DS_REPO_ID, HF_INDEX_FILE, HF_IDS_FILE
6
  from modules.retriever import set_index
7
+ from modules.corpus import prepare_corpus, _get_datasets, set_id_to_row
8
 
9
  _vector_ids = None
10
 
11
  def _load_index_in_memory():
12
  """HF Hub์—์„œ ์ธ๋ฑ์Šค/ID ๋งคํ•‘์„ ๋ฐ›์•„ ๋ฉ”๋ชจ๋ฆฌ์— ๋กœ๋“œ"""
13
+ index_path = hf_hub_download(
14
+ repo_id=HF_DS_REPO_ID,
15
+ filename=HF_INDEX_FILE,
16
+ repo_type="dataset"
17
+ )
18
+ ids_path = hf_hub_download(
19
+ repo_id=HF_DS_REPO_ID,
20
+ filename=HF_IDS_FILE,
21
+ repo_type="dataset"
22
+ )
23
  index = faiss.read_index(index_path)
24
  set_index(index)
25
  global _vector_ids
 
30
  return _vector_ids
31
 
32
def initialize_dbs():
    """Prepare the corpus, load the FAISS index, and build the page_id -> row map."""
    # 1) Ensure the corpus parquet files are downloaded (first run only).
    prepare_corpus()
    # 2) Pull the FAISS index and the vector-ID mapping into memory.
    _load_index_in_memory()
    # 3) Materialize every dataset row into a page_id -> row lookup table
    #    and hand it to the corpus module for fast context fetches.
    id_to_row = {
        row["page_id"]: row
        for dataset in _get_datasets().values()
        for row in dataset
    }
    set_id_to_row(id_to_row)
44
 
45
def force_update():
    """Re-download and reload the FAISS index and vector-ID mapping."""
    _load_index_in_memory()
47
+
models/reranker.py CHANGED
@@ -21,8 +21,8 @@ def rerank(request: Request, query: str, contexts: List[Dict]) -> List[Dict]:
21
  pairs = [(query, ctx["text"]) for ctx in contexts]
22
  inputs = tokenizer(pairs, return_tensors="np", padding=True, truncation=True, max_length=256)
23
  ort_inputs = {k: v for k, v in inputs.items()}
24
- scores = sess.run(None, ort_inputs)[0] # [batch, 1] ํ˜•ํƒœ
25
- scores = scores.squeeze(-1)
26
 
27
  for ctx, sc in zip(contexts, scores):
28
  ctx["score"] = float(sc)
 
21
  pairs = [(query, ctx["text"]) for ctx in contexts]
22
  inputs = tokenizer(pairs, return_tensors="np", padding=True, truncation=True, max_length=256)
23
  ort_inputs = {k: v for k, v in inputs.items()}
24
+ scores = sess.run(None, ort_inputs)[0] # [batch] ํ˜•ํƒœ
25
+ scores = scores.reshape(-1) # ํ•ญ์ƒ 1์ฐจ์› ๋ฒกํ„ฐ๋กœ ๋ณ€ํ™˜
26
 
27
  for ctx, sc in zip(contexts, scores):
28
  ctx["score"] = float(sc)
modules/corpus.py CHANGED
@@ -1,10 +1,11 @@
1
  # rag/modules/corpus.py
2
  from typing import List, Dict, Any
3
- from datasets import load_dataset, DatasetDict, Dataset
4
  from config import HF_CORPUS_REPO, HF_CORPUS_SUBSET, HF_CORPUS_SPLIT, MARKER_DIR, CORPUS_READY_MARK
5
  from modules.utils import ensure_dir, exists, touch
6
 
7
  _datasets: Dict[str, Dataset] = {}
 
8
 
9
  def prepare_corpus():
10
  """
@@ -30,32 +31,28 @@ def _get_datasets() -> Dict[str, Dataset]:
30
  )
31
  return _datasets
32
 
 
 
 
 
 
33
  def fetch_contexts_by_ids(ids: List[int]) -> List[Dict[str, Any]]:
34
  if not ids:
35
  return []
36
 
37
- datasets = _get_datasets()
38
- id_set = set(ids)
39
  results: List[Dict[str, Any]] = []
40
-
41
- # ๋ชจ๋“  subset์„ ์ˆœํšŒํ•˜๋ฉฐ page_id ๋งค์นญ
42
- for subset, ds in datasets.items():
43
- # filter๋ฅผ ์‚ฌ์šฉํ•˜๋ฉด ์ „์ฒด ์ˆœํšŒ๋ณด๋‹ค ๋น ๋ฆ„ (๋ณ‘๋ ฌ ์ตœ์ ํ™”)
44
- rows = ds.filter(lambda r: r["page_id"] in id_set)
45
-
46
- id_to_row = {r["page_id"]: r for r in rows}
47
- for i in ids:
48
- r = id_to_row.get(i)
49
- if r:
50
- results.append({
51
- "id": r["page_id"],
52
- "title": r.get("title", ""),
53
- "text": r.get("wikitext", ""),
54
- "url": r.get("url", ""),
55
- "metadata": {
56
- "date_modified": r.get("date_modified", ""),
57
- "in_language": r.get("in_language", ""),
58
- "wikidata_id": r.get("wikidata_id", "")
59
- }
60
- })
61
  return results
 
1
  # rag/modules/corpus.py
2
  from typing import List, Dict, Any
3
+ from datasets import load_dataset, Dataset
4
  from config import HF_CORPUS_REPO, HF_CORPUS_SUBSET, HF_CORPUS_SPLIT, MARKER_DIR, CORPUS_READY_MARK
5
  from modules.utils import ensure_dir, exists, touch
6
 
7
  _datasets: Dict[str, Dataset] = {}
8
+ _id_to_row: Dict[int, Dict[str, Any]] = {}
9
 
10
  def prepare_corpus():
11
  """
 
31
  )
32
  return _datasets
33
 
34
def set_id_to_row(mapping: Dict[int, Dict[str, Any]]):
    """Store the page_id -> row mapping built by the initializer."""
    global _id_to_row
    _id_to_row = mapping
38
+
39
def fetch_contexts_by_ids(ids: List[int]) -> List[Dict[str, Any]]:
    """Resolve page_ids into context dicts via the preloaded row mapping.

    Unknown (or falsy) rows are silently skipped; output order follows *ids*.
    """
    if not ids:
        return []

    contexts: List[Dict[str, Any]] = []
    for page_id in ids:
        row = _id_to_row.get(page_id)
        if not row:
            # No row cached for this id — skip it rather than fail.
            continue
        contexts.append({
            "id": row["page_id"],
            "title": row.get("title", ""),
            "text": row.get("wikitext", ""),
            "url": row.get("url", ""),
            "metadata": {
                "date_modified": row.get("date_modified", ""),
                "in_language": row.get("in_language", ""),
                "wikidata_id": row.get("wikidata_id", ""),
            },
        })
    return contexts