import os
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# 1. Data path configuration
source_paths = [
r"data/real_estate_agent/raw/past_papers/brokerage_law.jsonl",
r"data/real_estate_agent/raw/past_papers/civil_law.jsonl",
r"data/real_estate_agent/raw/past_papers/disclosure_taxation.jsonl",
r"data/real_estate_agent/raw/past_papers/introduction.jsonl",
r"data/real_estate_agent/raw/past_papers/public_law.jsonl",
]
INDEX_PATH = "data/index/index.faiss"
DOCS_PATH = "data/index/docs.npy"
# 2. Load the embedding model
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def init_faiss():
    questions = []

    # 3. Read the JSONL files and collect question texts
    for path in source_paths:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                question_text = data.get("question", "")
                if question_text:  # keep only non-empty questions
                    questions.append(question_text)

    print(f"✅ Loaded {len(questions)} questions in total")

    # 4. Generate embeddings for every question
    embeddings = embedding_model.encode(
        questions,
        batch_size=32,
        show_progress_bar=True
    )
    embeddings = np.array(embeddings).astype("float32")

    # 5. Build the FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # exact index using L2 distance
    index.add(embeddings)

    # 6. Save the index and the question texts
    os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)
    faiss.write_index(index, INDEX_PATH)
    np.save(DOCS_PATH, questions)  # stored as a NumPy array of strings

    print("✅ FAISS index and documents saved!")


if __name__ == "__main__":
    init_faiss()
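
For reference, a minimal sketch of how the saved artifacts could be queried afterwards. The search() helper and the example query string are illustrative and not part of the original script; only faiss.read_index, np.load, index.search, and SentenceTransformer.encode are standard API calls, and the paths and model name simply mirror the build script above.

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

INDEX_PATH = "data/index/index.faiss"
DOCS_PATH = "data/index/docs.npy"

embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def search(query: str, top_k: int = 5):
    # Load the saved index and the question texts it was built from
    index = faiss.read_index(INDEX_PATH)
    docs = np.load(DOCS_PATH)

    # Embed the query with the same model used at build time
    query_vec = embedding_model.encode([query]).astype("float32")

    # distances: L2 distances, indices: row positions into docs
    distances, indices = index.search(query_vec, top_k)
    return [(docs[i], float(d)) for i, d in zip(indices[0], distances[0])]

if __name__ == "__main__":
    # Illustrative query only; replace with a real exam question
    for text, dist in search("types of brokerage contracts"):
        print(f"{dist:.4f}  {text}")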