Spaces:

Junhoee
/

Megumin-chat

Sleeping

App Files Files Community

Junhoee committed 14 days ago

Commit

06b2015

verified ·

1 Parent(s): 558f57f

Upload 6 files

Browse files

Files changed (3) hide show

megumin_agent/agent.py +1 -1
megumin_agent/bootstrap.py +61 -498
megumin_agent/retrieval.py +112 -44

megumin_agent/agent.py CHANGED Viewed

@@ -127,7 +127,7 @@ root_agent = LlmAgent(
 이 tool은 스타일/페르소나용 사례 top-3와 사실/설정용 사례 top-3를 5:5 비중으로 함께 돌려줍니다.
 persona_matches는 메구밍의 성격, 말투, 감정선, 답변 리듬을 참고하는 용도입니다.
 fact_matches는 설정, 관계, 사건, 세계관 사실을 참고하는 용도입니다.
-두 종류의 사례를 모두 참고하되, 검색된 답변을 그대로 복사하지 마세요.
 검색 결과가 약하거나 없는 경우에도 메구밍 페르소나는 유지하되, 모르는 내용은 지어내지 말고 솔직하게 답하세요.
 최종 답변은 언제나 메구밍의 페르소나를 강하게 반영해야 하며, 내부 tool 이름이나 구현 세부사항은 드러내지 마세요.
 """.strip(),

 이 tool은 스타일/페르소나용 사례 top-3와 사실/설정용 사례 top-3를 5:5 비중으로 함께 돌려줍니다.
 persona_matches는 메구밍의 성격, 말투, 감정선, 답변 리듬을 참고하는 용도입니다.
 fact_matches는 설정, 관계, 사건, 세계관 사실을 참고하는 용도입니다.
+두 종류의 사례를 모두 참고하되 검색된 답변을 그대로 복사하지 마세요.
 검색 결과가 약하거나 없는 경우에도 메구밍 페르소나는 유지하되, 모르는 내용은 지어내지 말고 솔직하게 답하세요.
 최종 답변은 언제나 메구밍의 페르소나를 강하게 반영해야 하며, 내부 tool 이름이나 구현 세부사항은 드러내지 마세요.
 """.strip(),

megumin_agent/bootstrap.py CHANGED Viewed

@@ -1,531 +1,94 @@
 from __future__ import annotations
-import json
-import math
 import os
-import re
-import unicodedata
-from dataclasses import dataclass
-from functools import lru_cache
 from pathlib import Path
-from typing import Any
-from typing import Iterable
-import faiss
-import numpy as np
-from google import genai
-from google.genai import types
-QUESTION_KEYS = (
-    "question",
-    "query",
-    "q",
-    "prompt",
-    "user",
-    "instruction",
-    "input",
-)
-ANSWER_KEYS = (
-    "answer",
-    "response",
-    "a",
-    "output",
-    "assistant",
-    "completion",
-)
-COLLECTION_KEYS = ("items", "data", "examples", "dataset", "records")
-EMBEDDING_MODEL_NAME = os.getenv("MEGUMIN_EMBEDDING_MODEL", "gemini-embedding-001")
-EMBEDDING_DIMENSION = int(os.getenv("MEGUMIN_EMBEDDING_DIM", "768"))
-EMBEDDING_BATCH_SIZE = int(os.getenv("MEGUMIN_EMBEDDING_BATCH_SIZE", "100"))
-FAISS_INDEX_FILENAME = os.getenv("MEGUMIN_FAISS_INDEX_FILENAME", "megumin_questions.faiss")
-FAISS_QA_INDEX_FILENAME = os.getenv(
-    "MEGUMIN_FAISS_QA_INDEX_FILENAME",
-    "megumin_question_answer.faiss",
-)
-FAISS_METADATA_FILENAME = os.getenv(
-    "MEGUMIN_FAISS_METADATA_FILENAME",
-    "megumin_questions_meta.json",
-)
-PERSONA_DATASET_PATTERNS = ("megumin_qa_dataset.json",)
-FACT_DATASET_PATTERNS = ("namuwiki*.json",)
-def _normalize_text(value: Any) -> str:
-    text = str(value or "")
-    text = unicodedata.normalize("NFKC", text).strip()
-    text = re.sub(r"\s+", " ", text)
-    return text
-def _safe_excerpt(text: str, limit: int = 220) -> str:
-    compact = re.sub(r"\s+", " ", str(text or "")).strip()
-    if len(compact) <= limit:
-        return compact
-    return compact[: limit - 3].rstrip() + "..."
-def _normalize_patterns(patterns: Iterable[str] | None) -> tuple[str, ...]:
-    normalized = tuple(pattern.strip() for pattern in (patterns or ()) if pattern.strip())
-    return normalized
-def _record_search_text(record: "QaRecord", mode: str) -> str:
-    if mode == "question_answer":
-        return f"{record.question}\n{record.answer}".strip()
-    return record.question
-@dataclass(frozen=True)
-class QaRecord:
-    question: str
-    answer: str
-    source_file: str
-    metadata: dict[str, Any]
-    @property
-    def normalized_question(self) -> str:
-        return _normalize_text(self.question)
-@dataclass(frozen=True)
-class VectorStore:
-    records: tuple[QaRecord, ...]
-    index: faiss.Index
-    embedding_model: str
-    dimension: int
-def _extract_collection(payload: Any) -> list[Any]:
-    if isinstance(payload, list):
-        return payload
-    if isinstance(payload, dict):
-        for key in COLLECTION_KEYS:
-            value = payload.get(key)
-            if isinstance(value, list):
-                return value
-    return []
-def _pick_first(mapping: dict[str, Any], keys: tuple[str, ...]) -> str | None:
-    lowered = {str(key).lower(): value for key, value in mapping.items()}
-    for key in keys:
-        if key in lowered and lowered[key] not in (None, ""):
-            return str(lowered[key]).strip()
-    return None
-def _record_from_mapping(item: dict[str, Any], source_file: str) -> QaRecord | None:
-    question = _pick_first(item, QUESTION_KEYS)
-    answer = _pick_first(item, ANSWER_KEYS)
-    if not question or not answer:
-        return None
-    metadata = {
-        key: value
-        for key, value in item.items()
-        if str(key).lower() not in QUESTION_KEYS + ANSWER_KEYS
-    }
-    return QaRecord(
-        question=question,
-        answer=answer,
-        source_file=source_file,
-        metadata=metadata,
-    )
-def _load_json_records(path: Path) -> list[QaRecord]:
-    raw_text = path.read_text(encoding="utf-8")
-    stripped = raw_text.strip()
-    if not stripped:
-        return []
-    records: list[QaRecord] = []
     try:
-        payload = json.loads(stripped)
-    except json.JSONDecodeError:
-        payload = None
-    if payload is not None:
-        for item in _extract_collection(payload):
-            if isinstance(item, dict):
-                record = _record_from_mapping(item, path.name)
-                if record:
-                    records.append(record)
-        if records:
-            return records
-    for line in stripped.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            item = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-        if isinstance(item, dict):
-            record = _record_from_mapping(item, path.name)
-            if record:
-                records.append(record)
-    return records
-def _load_metadata_records(path: Path) -> tuple[QaRecord, ...]:
-    payload = json.loads(path.read_text(encoding="utf-8"))
-    records: list[QaRecord] = []
-    for item in _extract_collection(payload):
-        if isinstance(item, dict):
-            record = _record_from_mapping(item, path.name)
-            if record:
-                records.append(record)
-    return tuple(records)
-def _iter_matching_paths(root: Path, include_patterns: tuple[str, ...]) -> list[Path]:
-    if not include_patterns:
-        return sorted(root.glob("*.json"))
-    seen: set[Path] = set()
-    paths: list[Path] = []
-    for pattern in include_patterns:
-        for path in sorted(root.glob(pattern)):
-            if path in seen or path.suffix.lower() != ".json":
-                continue
-            seen.add(path)
-            paths.append(path)
-    return paths
-@lru_cache(maxsize=16)
-def _load_records(dataset_dir: str, include_patterns: tuple[str, ...] = ()) -> tuple[QaRecord, ...]:
-    root = Path(dataset_dir)
-    if not root.exists():
-        return tuple()
-    all_records: list[QaRecord] = []
-    for path in _iter_matching_paths(root, include_patterns):
-        try:
-            all_records.extend(_load_json_records(path))
-        except OSError:
-            continue
-        except UnicodeDecodeError:
-            continue
-    return tuple(all_records)
-@lru_cache(maxsize=2)
-def _get_genai_client() -> genai.Client:
-    return genai.Client()
-def _embed_texts(
-    texts: list[str],
-    *,
-    task_type: str,
-    embedding_model: str,
-    output_dimensionality: int,
-) -> np.ndarray:
-    if not texts:
-        return np.zeros((0, output_dimensionality), dtype="float32")
-    batches: list[np.ndarray] = []
-    batch_size = max(1, min(EMBEDDING_BATCH_SIZE, 100))
-    for start in range(0, len(texts), batch_size):
-        chunk = texts[start : start + batch_size]
-        response = _get_genai_client().models.embed_content(
-            model=embedding_model,
-            contents=chunk,
-            config=types.EmbedContentConfig(
-                task_type=task_type,
-                output_dimensionality=output_dimensionality,
-            ),
         )
-        vectors = np.array(
-            [embedding.values for embedding in response.embeddings],
-            dtype="float32",
-        )
-        if vectors.size == 0:
-            continue
-        faiss.normalize_L2(vectors)
-        batches.append(vectors)
-    if not batches:
-        return np.zeros((0, output_dimensionality), dtype="float32")
-    return np.vstack(batches)
-def _index_artifact_paths(dataset_dir: str | Path) -> tuple[Path, Path]:
-    root = Path(dataset_dir)
-    return (
-        root / FAISS_INDEX_FILENAME,
-        root / FAISS_METADATA_FILENAME,
-    )
-def _build_index_from_records(
-    records: tuple[QaRecord, ...],
-    *,
-    embedding_model: str,
-    output_dimensionality: int,
-    mode: str,
-) -> faiss.IndexFlatIP:
-    search_texts = [_record_search_text(record, mode) for record in records]
-    vectors = _embed_texts(
-        search_texts,
-        task_type="RETRIEVAL_DOCUMENT",
-        embedding_model=embedding_model,
-        output_dimensionality=output_dimensionality,
-    )
-    if vectors.size == 0:
-        raise RuntimeError("No embeddings were generated for the dataset records.")
-    index = faiss.IndexFlatIP(int(vectors.shape[1]))
-    index.add(vectors)
-    return index
-def build_and_save_faiss_index(
-    dataset_dir: str | Path,
-    *,
-    embedding_model: str = EMBEDDING_MODEL_NAME,
-    output_dimensionality: int = EMBEDDING_DIMENSION,
-    index_filename: str = FAISS_INDEX_FILENAME,
-    qa_index_filename: str = FAISS_QA_INDEX_FILENAME,
-    metadata_filename: str = FAISS_METADATA_FILENAME,
-    include_patterns: Iterable[str] | None = None,
-) -> tuple[Path, Path, Path]:
-    root = Path(dataset_dir)
-    records = _load_records(str(root.resolve()), _normalize_patterns(include_patterns))
-    if not records:
-        raise FileNotFoundError(f"No JSON records found under {root}")
-    question_index = _build_index_from_records(
-        records,
-        embedding_model=embedding_model,
-        output_dimensionality=output_dimensionality,
-        mode="question",
-    )
-    qa_index = _build_index_from_records(
-        records,
-        embedding_model=embedding_model,
-        output_dimensionality=output_dimensionality,
-        mode="question_answer",
-    )
-    index_path = root / index_filename
-    qa_index_path = root / qa_index_filename
-    metadata_path = root / metadata_filename
-    faiss.write_index(question_index, str(index_path))
-    faiss.write_index(qa_index, str(qa_index_path))
-    metadata_payload = {
-        "items": [
-            {
-                "question": record.question,
-                "answer": record.answer,
-                "source_file": record.source_file,
-                **record.metadata,
-            }
-            for record in records
-        ]
-    }
-    metadata_path.write_text(
-        json.dumps(metadata_payload, ensure_ascii=False, indent=2),
-        encoding="utf-8",
-    )
-    return index_path, qa_index_path, metadata_path
-@lru_cache(maxsize=8)
-def _load_vector_store(
-    dataset_dir: str,
-    embedding_model: str,
-    output_dimensionality: int,
-    include_patterns: tuple[str, ...] = (),
-    index_filename: str | None = FAISS_INDEX_FILENAME,
-    qa_index_filename: str | None = FAISS_QA_INDEX_FILENAME,
-    metadata_filename: str | None = FAISS_METADATA_FILENAME,
-    mode: str = "question",
-) -> VectorStore:
-    selected_index_filename = index_filename if mode == "question" else qa_index_filename
-    if selected_index_filename and metadata_filename:
-        index_path = Path(dataset_dir) / selected_index_filename
-        metadata_path = Path(dataset_dir) / metadata_filename
-    else:
-        index_path = metadata_path = None
-    if index_path and metadata_path and index_path.exists() and metadata_path.exists():
-        index = faiss.read_index(str(index_path))
-        records = _load_metadata_records(metadata_path)
-        if index.ntotal != len(records):
-            raise ValueError(
-                f"FAISS index size ({index.ntotal}) does not match metadata size ({len(records)})."
-            )
-        return VectorStore(
-            records=records,
-            index=index,
-            embedding_model=embedding_model,
-            dimension=index.d,
-        )
-    records = _load_records(dataset_dir, include_patterns)
-    if not records:
-        empty_index = faiss.IndexFlatIP(output_dimensionality)
-        return VectorStore(
-            records=tuple(),
-            index=empty_index,
-            embedding_model=embedding_model,
-            dimension=output_dimensionality,
-        )
-    index = _build_index_from_records(
-        records,
-        embedding_model=embedding_model,
-        output_dimensionality=output_dimensionality,
-        mode=mode,
-    )
-    return VectorStore(
-        records=records,
-        index=index,
-        embedding_model=embedding_model,
-        dimension=index.d,
-    )
-class JsonQaRetriever:
-    def __init__(
-        self,
-        dataset_dir: str | Path,
-        *,
-        embedding_model: str = EMBEDDING_MODEL_NAME,
-        output_dimensionality: int = EMBEDDING_DIMENSION,
-        include_patterns: Iterable[str] | None = None,
-        index_filename: str | None = FAISS_INDEX_FILENAME,
-        qa_index_filename: str | None = FAISS_QA_INDEX_FILENAME,
-        metadata_filename: str | None = FAISS_METADATA_FILENAME,
-    ):
-        self.dataset_dir = Path(dataset_dir)
-        self.embedding_model = embedding_model
-        self.output_dimensionality = output_dimensionality
-        self.include_patterns = _normalize_patterns(include_patterns)
-        self.index_filename = index_filename
-        self.qa_index_filename = qa_index_filename
-        self.metadata_filename = metadata_filename
-    def warmup(self) -> None:
-        _load_vector_store(
-            str(self.dataset_dir.resolve()),
-            self.embedding_model,
-            self.output_dimensionality,
-            self.include_patterns,
-            self.index_filename,
-            self.qa_index_filename,
-            self.metadata_filename,
-            "question",
-        )
-        _load_vector_store(
-            str(self.dataset_dir.resolve()),
-            self.embedding_model,
-            self.output_dimensionality,
-            self.include_patterns,
-            self.index_filename,
-            self.qa_index_filename,
-            self.metadata_filename,
-            "question_answer",
-        )
-    def _style_notes(self, matches: list[dict[str, Any]]) -> list[str]:
-        if not matches:
-            return [
-                "No strong example was retrieved, so stay in Megumin's persona without inventing unsupported canon facts.",
-            ]
-        notes = [
-            "Answer in first person as Megumin, with respectful but dramatic confidence.",
-            "Use the retrieved cases to mirror tone and answer shape, but do not copy them verbatim.",
-            "Prefer the retrieved answers as evidence for facts, relationships, and recurring phrasing.",
-        ]
-        long_answers = sum(
-            1 for match in matches if len(match.get("answer", "")) >= 180
-        )
-        if long_answers >= max(1, math.ceil(len(matches) / 2)):
-            notes.append(
-                "The retrieved examples skew narrative, so a short anecdotal lead-in is acceptable."
-            )
-        else:
-            notes.append(
-                "The retrieved examples are compact, so keep the answer concise and pointed."
-            )
-        return notes
-    def retrieve(self, query: str, top_k: int = 3) -> dict[str, Any]:
-        question_store = _load_vector_store(
-            str(self.dataset_dir.resolve()),
-            self.embedding_model,
-            self.output_dimensionality,
-            self.include_patterns,
-            self.index_filename,
-            self.qa_index_filename,
-            self.metadata_filename,
-            "question",
-        )
-        qa_store = _load_vector_store(
-            str(self.dataset_dir.resolve()),
-            self.embedding_model,
-            self.output_dimensionality,
-            self.include_patterns,
-            self.index_filename,
-            self.qa_index_filename,
-            self.metadata_filename,
-            "question_answer",
-        )
-        if not question_store.records:
-            return {
-                "query": query,
-                "match_count": 0,
-                "matches": [],
-                "style_notes": [
-                    "No processed JSON dataset was found for retrieval.",
-                ],
-            }
-        query_vector = _embed_texts(
-            [_normalize_text(query) or query],
-            task_type="RETRIEVAL_QUERY",
-            embedding_model=question_store.embedding_model,
-            output_dimensionality=question_store.dimension,
-        )
-        search_k = max(1, min(top_k, len(question_store.records)))
-        candidates: dict[int, dict[str, Any]] = {}
-        for store_name, store in (("question", question_store), ("question_answer", qa_store)):
-            scores, indices = store.index.search(query_vector, search_k)
-            for score, index in zip(scores[0], indices[0]):
-                if index < 0:
                     continue
-                record = store.records[int(index)]
-                current = candidates.get(int(index))
-                score_value = round(float(score), 6)
-                if current is None or score_value > current["score"]:
-                    candidates[int(index)] = {
-                        "question": record.question,
-                        "answer": _safe_excerpt(record.answer),
-                        "score": score_value,
-                        "source_file": record.source_file,
-                        "metadata": record.metadata,
-                        "matched_via": store_name,
-                    }
-        matches = sorted(
-            candidates.values(),
-            key=lambda item: item["score"],
-            reverse=True,
-        )[:top_k]
-        return {
-            "query": query,
-            "match_count": len(matches),
-            "matches": matches,
-            "style_notes": self._style_notes(matches),
-        }

 from __future__ import annotations
 import os
+import sys
 from pathlib import Path
+from dotenv import load_dotenv
+from huggingface_hub import hf_hub_download
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+ADK_SRC = PROJECT_ROOT / "adk-python" / "src"
+LOCAL_DATASET_DIR = PROJECT_ROOT / "data" / "processed"
+RUNTIME_DATASET_DIR = PROJECT_ROOT / "data" / "_runtime_processed"
+def _dataset_repo_id() -> str:
+    """Return the Hugging Face repo id that the dataset artifacts are pulled from.
+
+    The value is read from ``MEGUMIN_HF_DATASET_REPO_ID`` on every call, so a
+    change to the environment is picked up without re-importing the module.
+    """
+    fallback_repo = "Junhoee/megumin-chat"
+    return os.environ.get("MEGUMIN_HF_DATASET_REPO_ID", fallback_repo)
+def _dataset_filename() -> str:
+    """Return the main QA dataset filename (``MEGUMIN_HF_DATASET_FILENAME``).
+
+    ``resolve_dataset_dir`` treats this file as a required download.
+    """
+    fallback_name = "megumin_qa_dataset.json"
+    return os.environ.get("MEGUMIN_HF_DATASET_FILENAME", fallback_name)
+def _index_filename() -> str:
+    """Return the question-index filename (``MEGUMIN_FAISS_INDEX_FILENAME``)."""
+    fallback_name = "megumin_questions.faiss"
+    return os.environ.get("MEGUMIN_FAISS_INDEX_FILENAME", fallback_name)
+def _qa_index_filename() -> str:
+    """Return the question+answer index filename (``MEGUMIN_FAISS_QA_INDEX_FILENAME``)."""
+    fallback_name = "megumin_question_answer.faiss"
+    return os.environ.get("MEGUMIN_FAISS_QA_INDEX_FILENAME", fallback_name)
+def _metadata_filename() -> str:
+    """Return the index metadata filename (``MEGUMIN_FAISS_METADATA_FILENAME``)."""
+    fallback_name = "megumin_questions_meta.json"
+    return os.environ.get("MEGUMIN_FAISS_METADATA_FILENAME", fallback_name)
+def _fact_dataset_filename() -> str:
+    """Return the fact dataset filename (``MEGUMIN_HF_FACT_DATASET_FILENAME``).
+
+    ``resolve_dataset_dir`` treats this file as a required download.
+    """
+    fallback_name = "namuwiki_qa.json"
+    return os.environ.get("MEGUMIN_HF_FACT_DATASET_FILENAME", fallback_name)
+def _fact_index_filename() -> str:
+    """Return the fact question-index filename (``MEGUMIN_HF_FACT_INDEX_FILENAME``)."""
+    fallback_name = "namuwiki_questions.faiss"
+    return os.environ.get("MEGUMIN_HF_FACT_INDEX_FILENAME", fallback_name)
+def _fact_qa_index_filename() -> str:
+    """Return the fact question+answer index filename (``MEGUMIN_HF_FACT_QA_INDEX_FILENAME``)."""
+    fallback_name = "namuwiki_question_answer.faiss"
+    return os.environ.get("MEGUMIN_HF_FACT_QA_INDEX_FILENAME", fallback_name)
+def _fact_metadata_filename() -> str:
+    """Return the fact index metadata filename (``MEGUMIN_HF_FACT_METADATA_FILENAME``)."""
+    fallback_name = "namuwiki_questions_meta.json"
+    return os.environ.get("MEGUMIN_HF_FACT_METADATA_FILENAME", fallback_name)
+def bootstrap_environment() -> None:
+    """Prepare process-wide state needed before the agent is imported.
+
+    Loads ``<PROJECT_ROOT>/.env`` with ``override=True`` and, when the
+    ``adk-python/src`` directory exists under the project root, puts it at
+    the front of ``sys.path`` (once) so it wins over other copies on the path.
+    """
+    env_file = PROJECT_ROOT / ".env"
+    load_dotenv(env_file, override=True)
+    if not ADK_SRC.exists():
+        return
+    adk_path_entry = str(ADK_SRC)
+    # Avoid stacking duplicate entries when bootstrap runs more than once.
+    if adk_path_entry in sys.path:
+        return
+    sys.path.insert(0, adk_path_entry)
+def resolve_dataset_dir() -> Path:
+    """Return the directory that holds the QA datasets and index artifacts.
+
+    Every artifact named by the ``_*_filename`` helpers is downloaded from the
+    Hugging Face dataset repo into ``RUNTIME_DATASET_DIR``. The two dataset
+    JSON files are required; a failure to fetch any other artifact is logged
+    and tolerated. If the runtime directory cannot be prepared or a required
+    download fails, ``LOCAL_DATASET_DIR`` is returned instead, provided it
+    exists and contains at least one ``*.json`` file.
+
+    Returns:
+        ``RUNTIME_DATASET_DIR`` on success, otherwise ``LOCAL_DATASET_DIR``.
+
+    Raises:
+        Exception: the original error is re-raised unchanged when the remote
+            path fails and no usable local dataset directory is available.
+    """
+    import logging  # function-scope import keeps this block self-contained
+
+    log = logging.getLogger(__name__)
     try:
+        # Inside the try so an unwritable runtime dir also reaches the fallback.
+        RUNTIME_DATASET_DIR.mkdir(parents=True, exist_ok=True)
+        hf_token = os.getenv("HF_TOKEN") or None
+        repo_id = _dataset_repo_id()
+        # Computed once: only these files abort the remote path on failure.
+        required_names = frozenset((_dataset_filename(), _fact_dataset_filename()))
+        artifact_names = (
+            _dataset_filename(),
+            _index_filename(),
+            _qa_index_filename(),
+            _metadata_filename(),
+            _fact_dataset_filename(),
+            _fact_index_filename(),
+            _fact_qa_index_filename(),
+            _fact_metadata_filename(),
         )
+        for artifact_name in artifact_names:
+            try:
+                hf_hub_download(
+                    repo_id=repo_id,
+                    repo_type="dataset",
+                    filename=artifact_name,
+                    token=hf_token,
+                    local_dir=str(RUNTIME_DATASET_DIR),
+                )
+            except Exception:
+                if artifact_name in required_names:
+                    raise
+                # Best-effort artifact: keep going, but leave a trace.
+                log.warning(
+                    "Optional artifact %s could not be downloaded from %s; skipping.",
+                    artifact_name,
+                    repo_id,
+                    exc_info=True,
+                )
+        return RUNTIME_DATASET_DIR
+    except Exception:
+        if LOCAL_DATASET_DIR.exists() and any(LOCAL_DATASET_DIR.glob("*.json")):
+            log.warning(
+                "Remote dataset unavailable; falling back to %s.",
+                LOCAL_DATASET_DIR,
+                exc_info=True,
+            )
+            return LOCAL_DATASET_DIR
+        raise

megumin_agent/retrieval.py CHANGED Viewed

@@ -39,6 +39,10 @@ EMBEDDING_MODEL_NAME = os.getenv("MEGUMIN_EMBEDDING_MODEL", "gemini-embedding-00
 EMBEDDING_DIMENSION = int(os.getenv("MEGUMIN_EMBEDDING_DIM", "768"))
 EMBEDDING_BATCH_SIZE = int(os.getenv("MEGUMIN_EMBEDDING_BATCH_SIZE", "100"))
 FAISS_INDEX_FILENAME = os.getenv("MEGUMIN_FAISS_INDEX_FILENAME", "megumin_questions.faiss")
 FAISS_METADATA_FILENAME = os.getenv(
     "MEGUMIN_FAISS_METADATA_FILENAME",
     "megumin_questions_meta.json",
@@ -66,6 +70,12 @@ def _normalize_patterns(patterns: Iterable[str] | None) -> tuple[str, ...]:
     return normalized
 @dataclass(frozen=True)
 class QaRecord:
     question: str
@@ -254,36 +264,60 @@ def _index_artifact_paths(dataset_dir: str | Path) -> tuple[Path, Path]:
     )
 def build_and_save_faiss_index(
     dataset_dir: str | Path,
     *,
     embedding_model: str = EMBEDDING_MODEL_NAME,
     output_dimensionality: int = EMBEDDING_DIMENSION,
     index_filename: str = FAISS_INDEX_FILENAME,
     metadata_filename: str = FAISS_METADATA_FILENAME,
     include_patterns: Iterable[str] | None = None,
-) -> tuple[Path, Path]:
     root = Path(dataset_dir)
     records = _load_records(str(root.resolve()), _normalize_patterns(include_patterns))
     if not records:
         raise FileNotFoundError(f"No JSON records found under {root}")
-    questions = [record.normalized_question or record.question for record in records]
-    question_vectors = _embed_texts(
-        questions,
-        task_type="RETRIEVAL_DOCUMENT",
         embedding_model=embedding_model,
         output_dimensionality=output_dimensionality,
     )
-    if question_vectors.size == 0:
-        raise RuntimeError("No embeddings were generated for the dataset questions.")
-    index = faiss.IndexFlatIP(int(question_vectors.shape[1]))
-    index.add(question_vectors)
     index_path = root / index_filename
     metadata_path = root / metadata_filename
-    faiss.write_index(index, str(index_path))
     metadata_payload = {
         "items": [
             {
@@ -299,7 +333,7 @@ def build_and_save_faiss_index(
         json.dumps(metadata_payload, ensure_ascii=False, indent=2),
         encoding="utf-8",
     )
-    return index_path, metadata_path
 @lru_cache(maxsize=8)
@@ -309,10 +343,13 @@ def _load_vector_store(
     output_dimensionality: int,
     include_patterns: tuple[str, ...] = (),
     index_filename: str | None = FAISS_INDEX_FILENAME,
     metadata_filename: str | None = FAISS_METADATA_FILENAME,
 ) -> VectorStore:
-    if index_filename and metadata_filename:
-        index_path = Path(dataset_dir) / index_filename
         metadata_path = Path(dataset_dir) / metadata_filename
     else:
         index_path = metadata_path = None
@@ -341,21 +378,17 @@ def _load_vector_store(
             dimension=output_dimensionality,
         )
-    questions = [record.normalized_question or record.question for record in records]
-    question_vectors = _embed_texts(
-        questions,
-        task_type="RETRIEVAL_DOCUMENT",
         embedding_model=embedding_model,
         output_dimensionality=output_dimensionality,
     )
-    dimension = int(question_vectors.shape[1])
-    index = faiss.IndexFlatIP(dimension)
-    index.add(question_vectors)
     return VectorStore(
         records=records,
         index=index,
         embedding_model=embedding_model,
-        dimension=dimension,
     )
@@ -368,6 +401,7 @@ class JsonQaRetriever:
         output_dimensionality: int = EMBEDDING_DIMENSION,
         include_patterns: Iterable[str] | None = None,
         index_filename: str | None = FAISS_INDEX_FILENAME,
         metadata_filename: str | None = FAISS_METADATA_FILENAME,
     ):
         self.dataset_dir = Path(dataset_dir)
@@ -375,6 +409,7 @@ class JsonQaRetriever:
         self.output_dimensionality = output_dimensionality
         self.include_patterns = _normalize_patterns(include_patterns)
         self.index_filename = index_filename
         self.metadata_filename = metadata_filename
     def warmup(self) -> None:
@@ -384,7 +419,19 @@ class JsonQaRetriever:
             self.output_dimensionality,
             self.include_patterns,
             self.index_filename,
             self.metadata_filename,
         )
     def _style_notes(self, matches: list[dict[str, Any]]) -> list[str]:
@@ -413,15 +460,27 @@ class JsonQaRetriever:
         return notes
     def retrieve(self, query: str, top_k: int = 3) -> dict[str, Any]:
-        store = _load_vector_store(
             str(self.dataset_dir.resolve()),
             self.embedding_model,
             self.output_dimensionality,
             self.include_patterns,
             self.index_filename,
             self.metadata_filename,
         )
-        if not store.records:
             return {
                 "query": query,
                 "match_count": 0,
@@ -434,26 +493,35 @@ class JsonQaRetriever:
         query_vector = _embed_texts(
             [_normalize_text(query) or query],
             task_type="RETRIEVAL_QUERY",
-            embedding_model=store.embedding_model,
-            output_dimensionality=store.dimension,
         )
-        search_k = max(1, min(top_k, len(store.records)))
-        scores, indices = store.index.search(query_vector, search_k)
-        matches: list[dict[str, Any]] = []
-        for score, index in zip(scores[0], indices[0]):
-            if index < 0:
-                continue
-            record = store.records[int(index)]
-            matches.append(
-                {
-                    "question": record.question,
-                    "answer": _safe_excerpt(record.answer),
-                    "score": round(float(score), 6),
-                    "source_file": record.source_file,
-                    "metadata": record.metadata,
-                }
-            )
         return {
             "query": query,

 EMBEDDING_DIMENSION = int(os.getenv("MEGUMIN_EMBEDDING_DIM", "768"))
 EMBEDDING_BATCH_SIZE = int(os.getenv("MEGUMIN_EMBEDDING_BATCH_SIZE", "100"))
 FAISS_INDEX_FILENAME = os.getenv("MEGUMIN_FAISS_INDEX_FILENAME", "megumin_questions.faiss")
+FAISS_QA_INDEX_FILENAME = os.getenv(
+    "MEGUMIN_FAISS_QA_INDEX_FILENAME",
+    "megumin_question_answer.faiss",
+)
 FAISS_METADATA_FILENAME = os.getenv(
     "MEGUMIN_FAISS_METADATA_FILENAME",
     "megumin_questions_meta.json",
     return normalized
+def _record_search_text(record: "QaRecord", mode: str) -> str:
+    """Return the text of *record* to embed for the given index *mode*.
+
+    For ``mode == "question_answer"`` the question and answer are joined with
+    a newline and the result is stripped; any other mode yields the question
+    unchanged.
+    """
+    if mode != "question_answer":
+        return record.question
+    combined = "{}\n{}".format(record.question, record.answer)
+    return combined.strip()
 @dataclass(frozen=True)
 class QaRecord:
     question: str
     )
+def _build_index_from_records(
+    records: tuple[QaRecord, ...],
+    *,
+    embedding_model: str,
+    output_dimensionality: int,
+    mode: str,
+) -> faiss.IndexFlatIP:
+    """Embed *records* and return a ``faiss.IndexFlatIP`` containing them.
+
+    The text embedded for each record is chosen by ``_record_search_text``
+    according to *mode*; embeddings are requested from ``_embed_texts`` with
+    the ``RETRIEVAL_DOCUMENT`` task type. The index width is taken from the
+    returned vectors rather than from *output_dimensionality*.
+
+    Raises:
+        RuntimeError: if the embedding call returns no vectors.
+    """
+    texts_to_embed: list[str] = []
+    for item in records:
+        texts_to_embed.append(_record_search_text(item, mode))
+    embedded = _embed_texts(
+        texts_to_embed,
+        task_type="RETRIEVAL_DOCUMENT",
+        embedding_model=embedding_model,
+        output_dimensionality=output_dimensionality,
+    )
+    if embedded.size == 0:
+        raise RuntimeError("No embeddings were generated for the dataset records.")
+    width = int(embedded.shape[1])
+    built = faiss.IndexFlatIP(width)
+    built.add(embedded)
+    return built
 def build_and_save_faiss_index(
     dataset_dir: str | Path,
     *,
     embedding_model: str = EMBEDDING_MODEL_NAME,
     output_dimensionality: int = EMBEDDING_DIMENSION,
     index_filename: str = FAISS_INDEX_FILENAME,
+    qa_index_filename: str = FAISS_QA_INDEX_FILENAME,
     metadata_filename: str = FAISS_METADATA_FILENAME,
     include_patterns: Iterable[str] | None = None,
+) -> tuple[Path, Path, Path]:
     root = Path(dataset_dir)
     records = _load_records(str(root.resolve()), _normalize_patterns(include_patterns))
     if not records:
         raise FileNotFoundError(f"No JSON records found under {root}")
+    question_index = _build_index_from_records(
+        records,
         embedding_model=embedding_model,
         output_dimensionality=output_dimensionality,
+        mode="question",
+    )
+    qa_index = _build_index_from_records(
+        records,
+        embedding_model=embedding_model,
+        output_dimensionality=output_dimensionality,
+        mode="question_answer",
     )
     index_path = root / index_filename
+    qa_index_path = root / qa_index_filename
     metadata_path = root / metadata_filename
+    faiss.write_index(question_index, str(index_path))
+    faiss.write_index(qa_index, str(qa_index_path))
     metadata_payload = {
         "items": [
             {
         json.dumps(metadata_payload, ensure_ascii=False, indent=2),
         encoding="utf-8",
     )
+    return index_path, qa_index_path, metadata_path
 @lru_cache(maxsize=8)
     output_dimensionality: int,
     include_patterns: tuple[str, ...] = (),
     index_filename: str | None = FAISS_INDEX_FILENAME,
+    qa_index_filename: str | None = FAISS_QA_INDEX_FILENAME,
     metadata_filename: str | None = FAISS_METADATA_FILENAME,
+    mode: str = "question",
 ) -> VectorStore:
+    selected_index_filename = index_filename if mode == "question" else qa_index_filename
+    if selected_index_filename and metadata_filename:
+        index_path = Path(dataset_dir) / selected_index_filename
         metadata_path = Path(dataset_dir) / metadata_filename
     else:
         index_path = metadata_path = None
             dimension=output_dimensionality,
         )
+    index = _build_index_from_records(
+        records,
         embedding_model=embedding_model,
         output_dimensionality=output_dimensionality,
+        mode=mode,
     )
     return VectorStore(
         records=records,
         index=index,
         embedding_model=embedding_model,
+        dimension=index.d,
     )
         output_dimensionality: int = EMBEDDING_DIMENSION,
         include_patterns: Iterable[str] | None = None,
         index_filename: str | None = FAISS_INDEX_FILENAME,
+        qa_index_filename: str | None = FAISS_QA_INDEX_FILENAME,
         metadata_filename: str | None = FAISS_METADATA_FILENAME,
     ):
         self.dataset_dir = Path(dataset_dir)
         self.output_dimensionality = output_dimensionality
         self.include_patterns = _normalize_patterns(include_patterns)
         self.index_filename = index_filename
+        self.qa_index_filename = qa_index_filename
         self.metadata_filename = metadata_filename
     def warmup(self) -> None:
             self.output_dimensionality,
             self.include_patterns,
             self.index_filename,
+            self.qa_index_filename,
+            self.metadata_filename,
+            "question",
+        )
+        _load_vector_store(
+            str(self.dataset_dir.resolve()),
+            self.embedding_model,
+            self.output_dimensionality,
+            self.include_patterns,
+            self.index_filename,
+            self.qa_index_filename,
             self.metadata_filename,
+            "question_answer",
         )
     def _style_notes(self, matches: list[dict[str, Any]]) -> list[str]:
         return notes
     def retrieve(self, query: str, top_k: int = 3) -> dict[str, Any]:
+        question_store = _load_vector_store(
             str(self.dataset_dir.resolve()),
             self.embedding_model,
             self.output_dimensionality,
             self.include_patterns,
             self.index_filename,
+            self.qa_index_filename,
             self.metadata_filename,
+            "question",
         )
+        qa_store = _load_vector_store(
+            str(self.dataset_dir.resolve()),
+            self.embedding_model,
+            self.output_dimensionality,
+            self.include_patterns,
+            self.index_filename,
+            self.qa_index_filename,
+            self.metadata_filename,
+            "question_answer",
+        )
+        if not question_store.records:
             return {
                 "query": query,
                 "match_count": 0,
         query_vector = _embed_texts(
             [_normalize_text(query) or query],
             task_type="RETRIEVAL_QUERY",
+            embedding_model=question_store.embedding_model,
+            output_dimensionality=question_store.dimension,
         )
+        search_k = max(1, min(top_k, len(question_store.records)))
+        candidates: dict[int, dict[str, Any]] = {}
+        for store_name, store in (("question", question_store), ("question_answer", qa_store)):
+            scores, indices = store.index.search(query_vector, search_k)
+            for score, index in zip(scores[0], indices[0]):
+                if index < 0:
+                    continue
+                record = store.records[int(index)]
+                current = candidates.get(int(index))
+                score_value = round(float(score), 6)
+                if current is None or score_value > current["score"]:
+                    candidates[int(index)] = {
+                        "question": record.question,
+                        "answer": _safe_excerpt(record.answer),
+                        "score": score_value,
+                        "source_file": record.source_file,
+                        "metadata": record.metadata,
+                        "matched_via": store_name,
+                    }
+        matches = sorted(
+            candidates.values(),
+            key=lambda item: item["score"],
+            reverse=True,
+        )[:top_k]
         return {
             "query": query,