import os
import json

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Past-exam question files (JSONL, one question object per line), by subject.
source_paths = [
    r"data/real_estate_agent/raw/past_papers/brokerage_law.jsonl",
    r"data/real_estate_agent/raw/past_papers/civil_law.jsonl",
    r"data/real_estate_agent/raw/past_papers/disclosure_taxation.jsonl",
    r"data/real_estate_agent/raw/past_papers/introduction.jsonl",
    r"data/real_estate_agent/raw/past_papers/public_law.jsonl",
]

# Output locations for the FAISS index and the saved question texts.
INDEX_PATH = "data/index/index.faiss"
DOCS_PATH = "data/index/docs.npy"

# Sentence embedding model used to vectorize the questions.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def init_faiss():
    # Collect the question text from every JSONL line across all subjects.
    questions = []

    for path in source_paths:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                question_text = data.get("question", "")
                if question_text:
                    questions.append(question_text)

    print(f"✅ Loaded {len(questions)} questions in total")

    # Embed all questions in batches; the progress bar tracks encoding.
    embeddings = embedding_model.encode(
        questions,
        batch_size=32,
        show_progress_bar=True
    )
    embeddings = np.array(embeddings).astype("float32")

    # Build an exact L2 (Euclidean) index and add all question vectors.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Persist the index and the raw question texts side by side.
    os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)
    faiss.write_index(index, INDEX_PATH)
    np.save(DOCS_PATH, questions)
    print("✅ FAISS index and documents saved!")

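
# A minimal query-side sketch (illustrative addition, not part of the original
# script): load the saved index and question texts, embed a query with the
# same model, and return the k nearest questions with their L2 distances.
# The function name, the default k, and the return format are assumptions.
def search_similar_questions(query, k=3):
    index = faiss.read_index(INDEX_PATH)
    docs = np.load(DOCS_PATH, allow_pickle=True)
    query_vec = np.array(embedding_model.encode([query])).astype("float32")
    distances, indices = index.search(query_vec, k)
    return [(str(docs[i]), float(dist)) for dist, i in zip(distances[0], indices[0])]

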
if __name__ == "__main__":
    init_faiss()