Test / src /scripts /2_dump_vectors.py
Архипов Дмитрий
test
565e754
import time
import pandas as pd
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from src.db_utils.sql_utils import sql_fetch_batch
from src.db_utils.qdrant_utils import qdrant_insert, qdrant_create_index
from src.data.splitter import Splitter
if __name__ == "__main__":
splitter_mode = "recursive"
model_name = "deepvk/USER-bge-m3"
vector_index_name = f"{splitter_mode}_{model_name.split('/')[1]}"
# Инициализация объектов
splitter = Splitter(splitter_mode, chunk_size=256, chunk_overlap=64)
emb = HuggingFaceEmbeddings(
model_name=model_name,
encode_kwargs={"normalize_embeddings": True},
)
# Создание индекса
qdrant_create_index(
index_name=vector_index_name,
dim=len(emb.embed_documents(["None"])[0]),
distance="cosine",
)
# Загрузка документов батчами
batch_size = 16
offset = 0
while True:
rows = sql_fetch_batch(batch_size=batch_size, offset=offset)
if not rows:
break # дошли до конца
dfs = []
for r in rows:
chunks = splitter.split_text(r["content"])
vectors = emb.embed_documents(chunks)
dfs.append(pd.DataFrame({"doc_id": r["ctid"], "text": chunks, "vector": vectors}))
print(f"{offset} - {offset + batch_size}:", qdrant_insert(pd.concat(dfs), vector_index_name))
offset += batch_size
time.sleep(0.3)