"""Index the zh-cn MDN markdown files into a local Qdrant collection.

Every ``*.md`` file under ``translated-content/files/zh-cn`` is read, passed
whole to ``model.encode`` and upserted as one point whose payload holds the
document title and its full text.
"""
import glob
import re

import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, PointStruct, VectorParams

import model

# Matches a leading front-matter fence ("---") immediately followed by a
# "title:" key and captures the remainder of that line. "." does not cross
# newlines here, and the negative lookahead stops the capture at any "---".
# Compiled once so the loop does not go through the pattern lookup per file.
_TITLE_RE = re.compile(r'\s*---[\n\r]+title:(((?!---).)+)', re.M | re.I)


def _extract_title(text, fallback):
    """Return the front-matter title found at the start of *text*.

    Args:
        text: Full contents of a markdown document.
        fallback: Value returned when *text* does not start with a
            front-matter block whose first key is ``title``.

    Returns:
        The captured title with surrounding whitespace stripped, or
        *fallback* when the pattern does not match.
    """
    match = _TITLE_RE.match(text)
    if match is None:
        return fallback
    return match.group(1).strip()


def main():
    """Recreate the ``mdn-docs`` collection and upsert one point per file."""
    client = QdrantClient("127.0.0.1", port=6333)
    collection_name = "mdn-docs"
    # recreate_collection is called unconditionally, so point ids can simply
    # restart from 1 on every run.
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

    files = glob.glob("translated-content/files/zh-cn/**/*.md", recursive=True)
    print(len(files))

    # enumerate(start=1) yields the same ids as the former manual counter.
    for point_id, path in enumerate(tqdm.tqdm(files), start=1):
        # tqdm.write keeps the log line from corrupting the progress bar;
        # emitting it before open() also identifies a file that fails to load.
        tqdm.tqdm.write(f"file {path}")

        # Only the read needs the handle; close it before the slow
        # encode/upsert steps.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()

        # The file path doubles as the title when no front-matter title exists.
        title = _extract_title(text, path)
        # NOTE(review): assumes model.encode returns a 768-dimensional vector
        # matching the collection's VectorParams - confirm against `model`.
        vector = model.encode(text)

        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=[
                PointStruct(
                    id=point_id,
                    vector=vector,
                    payload={"title": title, "text": text},
                ),
            ],
        )


if __name__ == '__main__':
    main()