m97j committed on
Commit
33b550a
ยท
1 Parent(s): 5f612cd

Initial code commit

Browse files
Files changed (3) hide show
  1. db/initializer.py +20 -9
  2. models/reranker.py +2 -2
  3. modules/corpus.py +21 -24
db/initializer.py CHANGED
@@ -4,14 +4,22 @@ import numpy as np
4
  from huggingface_hub import hf_hub_download
5
  from config import HF_DS_REPO_ID, HF_INDEX_FILE, HF_IDS_FILE
6
  from modules.retriever import set_index
7
- from modules import corpus
8
 
9
  _vector_ids = None
10
 
11
  def _load_index_in_memory():
12
  """HF Hub์—์„œ ์ธ๋ฑ์Šค/ID ๋งคํ•‘์„ ๋ฐ›์•„ ๋ฉ”๋ชจ๋ฆฌ์— ๋กœ๋“œ"""
13
- index_path = hf_hub_download(repo_id=HF_DS_REPO_ID, filename=HF_INDEX_FILE, repo_type="dataset")
14
- ids_path = hf_hub_download(repo_id=HF_DS_REPO_ID, filename=HF_IDS_FILE, repo_type="dataset")
 
 
 
 
 
 
 
 
15
  index = faiss.read_index(index_path)
16
  set_index(index)
17
  global _vector_ids
@@ -22,15 +30,18 @@ def get_vector_ids():
22
  return _vector_ids
23
 
24
  def initialize_dbs():
25
- # 1) ์ฝ”ํผ์Šค ์ค€๋น„ (์ตœ์ดˆ 1ํšŒ๋งŒ ๋‹ค์šด๋กœ๋“œ)
26
- corpus.prepare_corpus()
27
  # 2) ์ธ๋ฑ์Šค/ID ๋งคํ•‘ ๋ฉ”๋ชจ๋ฆฌ ๋กœ๋“œ
28
  _load_index_in_memory()
29
- # 3) Arrow ์บ์‹œ ์ƒ์„ฑ
30
- datasets = corpus._get_datasets()
 
31
  for _subset, ds in datasets.items():
32
- # dummy ํ˜ธ์ถœ๋กœ ์บ์‹œ ์ƒ์„ฑ
33
- _ = ds.filter(lambda r: False)
 
34
 
35
  def force_update():
36
  _load_index_in_memory()
 
 
4
  from huggingface_hub import hf_hub_download
5
  from config import HF_DS_REPO_ID, HF_INDEX_FILE, HF_IDS_FILE
6
  from modules.retriever import set_index
7
+ from modules.corpus import prepare_corpus, _get_datasets, set_id_to_row
8
 
9
  _vector_ids = None
10
 
11
  def _load_index_in_memory():
12
  """HF Hub์—์„œ ์ธ๋ฑ์Šค/ID ๋งคํ•‘์„ ๋ฐ›์•„ ๋ฉ”๋ชจ๋ฆฌ์— ๋กœ๋“œ"""
13
+ index_path = hf_hub_download(
14
+ repo_id=HF_DS_REPO_ID,
15
+ filename=HF_INDEX_FILE,
16
+ repo_type="dataset"
17
+ )
18
+ ids_path = hf_hub_download(
19
+ repo_id=HF_DS_REPO_ID,
20
+ filename=HF_IDS_FILE,
21
+ repo_type="dataset"
22
+ )
23
  index = faiss.read_index(index_path)
24
  set_index(index)
25
  global _vector_ids
 
30
  return _vector_ids
31
 
32
def initialize_dbs():
    """Prepare the corpus, load the FAISS index, and build the page_id -> row map."""
    # 1) Ensure the corpus parquet files are downloaded (first run only).
    prepare_corpus()
    # 2) Pull the FAISS index and the vector-ID mapping into memory.
    _load_index_in_memory()
    # 3) Materialize every dataset row into a page_id -> row lookup table
    #    and hand it to the corpus module for fast context fetches.
    id_to_row = {
        row["page_id"]: row
        for dataset in _get_datasets().values()
        for row in dataset
    }
    set_id_to_row(id_to_row)
44
 
45
def force_update():
    """Re-download and reload the FAISS index and vector-ID mapping."""
    _load_index_in_memory()
47
+
models/reranker.py CHANGED
@@ -21,8 +21,8 @@ def rerank(request: Request, query: str, contexts: List[Dict]) -> List[Dict]:
21
  pairs = [(query, ctx["text"]) for ctx in contexts]
22
  inputs = tokenizer(pairs, return_tensors="np", padding=True, truncation=True, max_length=256)
23
  ort_inputs = {k: v for k, v in inputs.items()}
24
- scores = sess.run(None, ort_inputs)[0] # [batch, 1] ํ˜•ํƒœ
25
- scores = scores.squeeze(-1)
26
 
27
  for ctx, sc in zip(contexts, scores):
28
  ctx["score"] = float(sc)
 
21
  pairs = [(query, ctx["text"]) for ctx in contexts]
22
  inputs = tokenizer(pairs, return_tensors="np", padding=True, truncation=True, max_length=256)
23
  ort_inputs = {k: v for k, v in inputs.items()}
24
+ scores = sess.run(None, ort_inputs)[0] # [batch] ํ˜•ํƒœ
25
+ scores = scores.reshape(-1) # ํ•ญ์ƒ 1์ฐจ์› ๋ฒกํ„ฐ๋กœ ๋ณ€ํ™˜
26
 
27
  for ctx, sc in zip(contexts, scores):
28
  ctx["score"] = float(sc)
modules/corpus.py CHANGED
@@ -1,10 +1,11 @@
1
  # rag/modules/corpus.py
2
  from typing import List, Dict, Any
3
- from datasets import load_dataset, DatasetDict, Dataset
4
  from config import HF_CORPUS_REPO, HF_CORPUS_SUBSET, HF_CORPUS_SPLIT, MARKER_DIR, CORPUS_READY_MARK
5
  from modules.utils import ensure_dir, exists, touch
6
 
7
  _datasets: Dict[str, Dataset] = {}
 
8
 
9
  def prepare_corpus():
10
  """
@@ -30,32 +31,28 @@ def _get_datasets() -> Dict[str, Dataset]:
30
  )
31
  return _datasets
32
 
 
 
 
 
 
33
  def fetch_contexts_by_ids(ids: List[int]) -> List[Dict[str, Any]]:
34
  if not ids:
35
  return []
36
 
37
- datasets = _get_datasets()
38
- id_set = set(ids)
39
  results: List[Dict[str, Any]] = []
40
-
41
- # ๋ชจ๋“  subset์„ ์ˆœํšŒํ•˜๋ฉฐ page_id ๋งค์นญ
42
- for subset, ds in datasets.items():
43
- # filter๋ฅผ ์‚ฌ์šฉํ•˜๋ฉด ์ „์ฒด ์ˆœํšŒ๋ณด๋‹ค ๋น ๋ฆ„ (๋ณ‘๋ ฌ ์ตœ์ ํ™”)
44
- rows = ds.filter(lambda r: r["page_id"] in id_set)
45
-
46
- id_to_row = {r["page_id"]: r for r in rows}
47
- for i in ids:
48
- r = id_to_row.get(i)
49
- if r:
50
- results.append({
51
- "id": r["page_id"],
52
- "title": r.get("title", ""),
53
- "text": r.get("wikitext", ""),
54
- "url": r.get("url", ""),
55
- "metadata": {
56
- "date_modified": r.get("date_modified", ""),
57
- "in_language": r.get("in_language", ""),
58
- "wikidata_id": r.get("wikidata_id", "")
59
- }
60
- })
61
  return results
 
1
  # rag/modules/corpus.py
2
  from typing import List, Dict, Any
3
+ from datasets import load_dataset, Dataset
4
  from config import HF_CORPUS_REPO, HF_CORPUS_SUBSET, HF_CORPUS_SPLIT, MARKER_DIR, CORPUS_READY_MARK
5
  from modules.utils import ensure_dir, exists, touch
6
 
7
  _datasets: Dict[str, Dataset] = {}
8
+ _id_to_row: Dict[int, Dict[str, Any]] = {}
9
 
10
  def prepare_corpus():
11
  """
 
31
  )
32
  return _datasets
33
 
34
def set_id_to_row(mapping: Dict[int, Dict[str, Any]]):
    """Store the page_id -> row mapping built by the initializer."""
    global _id_to_row
    _id_to_row = mapping
38
+
39
def fetch_contexts_by_ids(ids: List[int]) -> List[Dict[str, Any]]:
    """Resolve page_ids into context dicts via the preloaded row mapping.

    Unknown (or falsy) rows are silently skipped; output order follows *ids*.
    """
    if not ids:
        return []

    contexts: List[Dict[str, Any]] = []
    for page_id in ids:
        row = _id_to_row.get(page_id)
        if not row:
            # No row cached for this id — skip it rather than fail.
            continue
        contexts.append({
            "id": row["page_id"],
            "title": row.get("title", ""),
            "text": row.get("wikitext", ""),
            "url": row.get("url", ""),
            "metadata": {
                "date_modified": row.get("date_modified", ""),
                "in_language": row.get("in_language", ""),
                "wikidata_id": row.get("wikidata_id", ""),
            },
        })
    return contexts