# File size: 1,849 Bytes
# 9b14ff1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# 1. Data path configuration: the JSONL source files read by init_faiss().
source_paths = [
    r"data/real_estate_agent/raw/past_papers/brokerage_law.jsonl",
    r"data/real_estate_agent/raw/past_papers/civil_law.jsonl",
    r"data/real_estate_agent/raw/past_papers/disclosure_taxation.jsonl",
    r"data/real_estate_agent/raw/past_papers/introduction.jsonl",
    r"data/real_estate_agent/raw/past_papers/public_law.jsonl",
]

# Output locations: the serialized FAISS index and the saved question texts.
INDEX_PATH = "data/index/index.faiss"
DOCS_PATH = "data/index/docs.npy"

# 2. Load the embedding model (instantiated once, when this module is imported).
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def _load_questions(paths):
    """Collect the non-empty ``"question"`` values from JSONL files.

    Args:
        paths: Iterable of JSONL file paths. Every non-blank line must hold
            one JSON object.

    Returns:
        A list of the truthy ``"question"`` values, in file order and then
        line order.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    questions = []
    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing empty line), which
                # json.loads would otherwise reject.
                if not line.strip():
                    continue
                question_text = json.loads(line).get("question", "")
                if question_text:  # keep only non-empty questions
                    questions.append(question_text)
    return questions


def init_faiss():
    """Build a FAISS L2 index over the questions in ``source_paths`` and save it.

    Reads every JSONL file listed in ``source_paths``, embeds the collected
    questions with ``embedding_model``, then writes the index to
    ``INDEX_PATH`` and the question texts to ``DOCS_PATH``.

    Raises:
        ValueError: If no questions were found; there is nothing to embed and
            the embedding dimension cannot be determined.
    """
    # 3. Read the JSONL files.
    questions = _load_questions(source_paths)

    # Status message: "Loaded N questions in total".
    print(f"✅ 총 {len(questions)}개 질문 로딩 완료")

    if not questions:
        raise ValueError("No questions found in source_paths; nothing to index.")

    # 4. Create the embeddings.
    embeddings = embedding_model.encode(
        questions,
        batch_size=32,
        show_progress_bar=True,
    )
    # FAISS works on float32 matrices.
    embeddings = np.asarray(embeddings, dtype="float32")

    # 5. Build the FAISS index (flat index using L2 distance).
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # 6. Save. Make sure the parent directory of each output exists; a bare
    # filename has no parent to create.
    for out_path in (INDEX_PATH, DOCS_PATH):
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
    faiss.write_index(index, INDEX_PATH)
    np.save(DOCS_PATH, questions)

    # Status message: "FAISS index and documents saved!".
    print("✅ FAISS 인덱스와 문서 저장 완료!")

# Build and save the index only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    init_faiss()