File size: 1,251 Bytes
03f8b69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
import tqdm
import glob
import model
import re

# Compiled once instead of being looked up on every loop iteration.
# Matches only when ``title:`` is the first key after the opening ``---`` at
# the very start of the text; captures the remainder of that line, stopping
# early if a ``---`` run appears inside it.
_TITLE_RE = re.compile(r'\s*---[\n\r]+title:(((?!---).)+)', re.M | re.I)


def extract_title(text, fallback):
    """Return the front-matter title found in *text*, or *fallback*.

    Args:
        text: Full contents of a document.
        fallback: Value returned when no usable title is present (the
            caller passes the file path).

    Returns:
        The captured title with surrounding whitespace removed. *fallback*
        is returned when the pattern does not match, or when the captured
        title is blank after stripping.
    """
    found = _TITLE_RE.match(text)
    title = found.group(1).strip() if found else ""
    # A whitespace-only title is as useless as a missing one.
    return title or fallback


def main():
    """Rebuild the ``mdn-docs`` collection from the zh-cn Markdown files.

    Recreates the collection on the Qdrant instance at 127.0.0.1:6333, then
    reads every ``*.md`` file under ``translated-content/files/zh-cn``,
    encodes its full text with ``model.encode`` and upserts one point per
    file. Point ids are the 1-based position of the file in the glob result.
    """
    client = QdrantClient("127.0.0.1", port=6333)
    collection_name = "mdn-docs"
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

    files = glob.glob("translated-content/files/zh-cn/**/*.md", recursive=True)
    print(len(files))
    for point_id, path in enumerate(tqdm.tqdm(files), start=1):
        # tqdm.write keeps the progress bar intact; a bare print() garbles it.
        tqdm.tqdm.write(f"file {path}")

        # Only the read needs the handle; close it before the slow encode and
        # network round-trip.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()

        title = extract_title(text, path)

        # NOTE(review): model.encode presumably returns a vector whose length
        # matches the collection's configured size (768) -- confirm in model.
        vector = model.encode(text)
        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=[
                PointStruct(
                    id=point_id,
                    vector=vector,
                    payload={"title": title, "text": text},
                ),
            ],
        )


if __name__ == '__main__':
    main()