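# NOTE(review): "Spaces: / Sleeping / Sleeping" looks like page-header residue
# from wherever this file was copied; kept as a comment so the module parses.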
import glob
import re

import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct

import model
# Qdrant collection that receives one point per markdown file.
COLLECTION_NAME = "mdn-docs"
# Dimensionality the collection is created with; the vectors produced by
# model.encode() are presumably this size -- TODO confirm against `model`.
VECTOR_SIZE = 768
# Markdown sources to index (searched recursively).
DOCS_GLOB = "translated-content/files/zh-cn/**/*.md"

# Opening front-matter fence: optional leading whitespace, '---', newline(s).
_FRONT_MATTER_OPEN = re.compile(r'\s*---[\n\r]+')
# Closing fence: a later line that starts with '---'.
_FRONT_MATTER_CLOSE = re.compile(r'^---', re.M)
# A top-level 'title:' line; the capture stops at end of line or at '---'.
_TITLE_LINE = re.compile(r'^title:(((?!---).)+)', re.M | re.I)


def extract_title(text: str, default: str) -> str:
    """Return the ``title:`` value from the front matter of *text*.

    The front matter is the section opened by a leading ``---`` line.  When
    a closing ``---`` line exists, a ``title:`` line is looked up anywhere
    between the two fences, so the key does not have to be the first one.
    When there is no closing fence, ``title:`` is only accepted directly
    after the opening fence.

    Args:
        text: Full content of a markdown document.
        default: Value returned when no title can be found.

    Returns:
        The title with surrounding whitespace stripped, or *default*.
    """
    opening = _FRONT_MATTER_OPEN.match(text)
    if opening is None:
        return default

    # Slice instead of passing ``pos``: '^' must be able to match right at
    # the start of the front-matter body.
    rest = text[opening.end():]
    closing = _FRONT_MATTER_CLOSE.search(rest)
    if closing is None:
        # Unterminated front matter: only trust a title on the first line.
        found = _TITLE_LINE.match(rest)
    else:
        found = _TITLE_LINE.search(rest[:closing.start()])

    if found is None:
        return default
    return found.group(1).strip()


def main() -> None:
    """Recreate the collection and upsert one point per markdown file.

    Each point stores the vector returned by ``model.encode`` for the whole
    file text, with the title and the text as payload.  Point ids are the
    1-based position of the file in the glob result.
    """
    client = QdrantClient("127.0.0.1", port=6333)
    # Drops any existing collection of that name before creating it anew.
    client.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
    )

    files = glob.glob(DOCS_GLOB, recursive=True)
    print(len(files))

    for point_id, path in enumerate(tqdm.tqdm(files), start=1):
        # tqdm.write keeps the progress bar intact while logging each file.
        tqdm.tqdm.write(f"file {path}")

        # Read and close the file before the slow encode/upsert steps.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Fall back to the file path when the document has no usable title.
        title = extract_title(text, path)
        vector = model.encode(text)
        client.upsert(
            collection_name=COLLECTION_NAME,
            wait=True,
            points=[
                PointStruct(
                    id=point_id,
                    vector=vector,
                    payload={"title": title, "text": text},
                ),
            ],
        )


if __name__ == '__main__':
    main()