trantuan1701's picture
initialize project
602e9df
raw
history blame
2.93 kB
import qdrant_client
from langchain.vectorstores import Qdrant
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import os
import json
import re
from config import EMBEDDING_MODEL, QDRANT_HOST, QDRANT_API_KEY, QDRANT_COLECTION_NAME, EMBEDDING_SIZE
# Shared embedding model, instantiated once when this module is imported from
# the EMBEDDING_MODEL name in config; load_vectordb() hands it to the Qdrant
# vector store wrapper.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
def load_json(file_path):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
        return parsed
    raise FileNotFoundError(f"File not found: {file_path}")
def load_data(filepath):
    """Convert the product records of a JSON file into LangChain documents.

    Every item obtained by iterating the loaded JSON becomes one ``Document``.
    Its ``page_content`` is a single-line summary (URL, name, product id,
    model, original price, promotional price, discount, description and the
    JSON-encoded specifications); its metadata holds the record's ``L1``-``L4``
    and ``SKU`` fields plus ``price`` under the key ``"Price"``, each falling
    back to ``"N/A"`` when the record lacks the field.

    Args:
        filepath: Path of the JSON file; it is read through ``load_json``.
            Assumes the top-level JSON value is a list of product objects
            (each item must support ``.get``).

    Returns:
        A list of ``Document`` objects in iteration order.
    """
    copied_keys = ("L1", "L2", "L3", "L4", "SKU")

    def to_document(product):
        # Multi-line template for readability only; it is flattened below.
        summary = f"""
        URL: {product.get("url", "Không có thông tin")},
        Tên: {product.get("product_name", "Không có tên")},
        Mã sản phẩm: {product.get("product_id", "Không có mã")},
        Model: {product.get("model", "Không có model")},
        Giá gốc: {product.get("old_price", "Không có giá")} VND,
        Giá KM: {product.get("price", "Không có giá")} VND,
        Giảm giá: {product.get("discount_rate", "Không có thông tin")},
        Mô tả: {product.get("description", "Không có mô tả")},
        Thông số kỹ thuật: {json.dumps(product.get("specifications", {}), ensure_ascii=False)}
        """
        # Each newline, together with any whitespace that follows it, is
        # replaced by one space so the summary becomes a single line.
        summary = re.sub(r'\n\s*', ' ', summary)

        metadata = {key: product.get(key, "N/A") for key in copied_keys}
        metadata["Price"] = product.get("price", "N/A")
        return Document(page_content=summary, metadata=metadata)

    return [to_document(product) for product in load_json(filepath)]
# Module-level Qdrant client, created when this module is imported from the
# QDRANT_HOST and QDRANT_API_KEY values in config; load_vectordb() uses it for
# collection management and passes it to the vector store wrapper.
client = qdrant_client.QdrantClient(QDRANT_HOST, api_key=QDRANT_API_KEY)
def load_vectordb(data_path="data/rangdong.json"):
    """Return a LangChain ``Qdrant`` vector store for the configured collection.

    If the server has no collection named ``QDRANT_COLECTION_NAME``, the
    collection is created (``EMBEDDING_SIZE`` dimensions, cosine distance) and
    seeded with the documents produced by ``load_data(data_path)``. If the
    collection already exists it is reused as-is and nothing is added to it.

    Args:
        data_path: JSON product file used to seed a newly created collection.
            Defaults to the path that was previously hard-coded, so existing
            zero-argument callers behave as before.

    Returns:
        The ``Qdrant`` vector store bound to the module-level ``client`` and
        ``embedding_model``.

    Raises:
        FileNotFoundError: If the collection must be seeded and ``data_path``
            does not exist (propagated from ``load_data``).
    """
    known_collections = client.get_collections().collections
    needs_seeding = not any(
        col.name == QDRANT_COLECTION_NAME for col in known_collections
    )

    seed_documents = None
    if needs_seeding:
        # Read the seed file BEFORE creating the collection: if the file is
        # missing or invalid we fail here, instead of leaving behind an empty
        # collection that later calls would treat as already populated.
        seed_documents = load_data(data_path)
        client.create_collection(
            collection_name=QDRANT_COLECTION_NAME,
            vectors_config=qdrant_client.http.models.VectorParams(
                size=EMBEDDING_SIZE,
                distance=qdrant_client.http.models.Distance.COSINE,
            ),
        )

    # The wrapper is configured identically whether the collection is new or
    # pre-existing, so it is built once for both cases.
    vector_store = Qdrant(
        client=client,
        collection_name=QDRANT_COLECTION_NAME,
        embeddings=embedding_model,
        content_payload_key="page_content",
        metadata_payload_key="metadata",
    )

    if needs_seeding:
        vector_store.add_documents(seed_documents)
    return vector_store