# Source: embeddings-api-ernie / init_data.py
# Duplicated from justest/embeddings-api (revision b19a023, 1.25 kB).
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
import tqdm
import glob
import model
import re
# Matches an opening front-matter fence ("---") that is immediately followed
# by a "title:" line, and captures the remainder of that line (stopping early
# at any "---"). Compiled once here so the per-file loop does not repeat the
# pattern lookup.
_TITLE_RE = re.compile(r'\s*---[\n\r]+title:(((?!---).)+)', re.M | re.I)


def extract_title(text: str, fallback: str) -> str:
    """Return the front-matter title of *text*, or *fallback* if none is found.

    The title is recognised only when ``title:`` is the first line after the
    opening ``---`` fence at the start of the document (leading whitespace is
    tolerated, and the key is matched case-insensitively). Surrounding
    whitespace is stripped from the captured value.

    Args:
        text: Full contents of a markdown document.
        fallback: Value returned when no title line matches (the script
            passes the file path).

    Returns:
        The stripped title string, or ``fallback``.
    """
    found = _TITLE_RE.match(text)
    if found is None:
        return fallback
    return found.group(1).strip()


def main() -> None:
    """Rebuild the ``mdn-docs`` Qdrant collection from the zh-cn markdown files.

    Connects to Qdrant on 127.0.0.1:6333, recreates the collection with
    768-dimensional cosine vectors, then embeds every ``*.md`` file found
    under ``translated-content/files/zh-cn`` and upserts it as one point whose
    payload holds the document title and full text.
    """
    client = QdrantClient("127.0.0.1", port=6333)
    collection_name = "mdn-docs"

    # recreate_collection is called unconditionally, so any previous contents
    # of the collection are replaced by this run.
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

    files = glob.glob("translated-content/files/zh-cn/**/*.md", recursive=True)
    print(len(files))

    # Point ids are the 1-based position of each file in the glob result.
    for point_id, path in enumerate(tqdm.tqdm(files), start=1):
        print('file', path)

        # Read the whole document, then release the handle before the
        # embedding and network calls below.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()

        title = extract_title(text, path)

        # NOTE(review): model.encode is defined outside this file; it is
        # presumably expected to return a 768-element vector to match the
        # collection's VectorParams above - confirm.
        vector = model.encode(text)

        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=[
                PointStruct(
                    id=point_id,
                    vector=vector,
                    payload={"title": title, "text": text},
                ),
            ],
        )


if __name__ == '__main__':
    main()