lwdgit committed on
Commit
03f8b69
1 Parent(s): 597fe90

feat: add init scripts

Browse files
Files changed (2) hide show
  1. init_data.py +39 -0
  2. translated-content +1 -0
init_data.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from qdrant_client import QdrantClient
2
+ from qdrant_client.http.models import Distance, VectorParams
3
+ from qdrant_client.http.models import PointStruct
4
+ import tqdm
5
+ import glob
6
+ import model
7
+ import re
8
+
9
# Matches a leading front-matter fence (``---``) whose first key is ``title:``
# and captures the remainder of that line, stopping early if ``---`` appears.
# Compiled once so the per-file loop does not repeat the pattern lookup.
TITLE_PATTERN = re.compile(r'\s*---[\n\r]+title:(((?!---).)+)', re.M | re.I)


def extract_title(text, fallback):
    """Return the front-matter title of *text*, or *fallback* if none is found.

    Only a ``title:`` key that immediately follows the opening ``---`` fence
    at the very start of the document is recognised (matching is
    case-insensitive). The captured value is stripped of surrounding
    whitespace.

    Args:
        text: Full contents of a markdown document.
        fallback: Value to return when no title can be extracted (the
            script passes the file path).

    Returns:
        The stripped title string, or *fallback*.
    """
    found = TITLE_PATTERN.match(text)
    return found.group(1).strip() if found else fallback


def main():
    """Rebuild the ``mdn-docs`` Qdrant collection from the zh-cn markdown files.

    Drops and recreates the collection, then for every markdown file under
    ``translated-content/files/zh-cn`` encodes the full text with
    ``model.encode`` and upserts one point carrying the title and text.
    """
    client = QdrantClient("127.0.0.1", port=6333)
    collection_name = "mdn-docs"
    # NOTE(review): newer qdrant-client releases appear to deprecate
    # recreate_collection in favour of delete + create -- confirm against
    # the pinned client version before switching.
    client.recreate_collection(
        collection_name=collection_name,
        # assumes model.encode returns 768-dimensional vectors -- TODO confirm
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

    # glob returns paths in arbitrary order; sort so that the sequential
    # point ids below map to the same documents on every run.
    files = sorted(
        glob.glob("translated-content/files/zh-cn/**/*.md", recursive=True)
    )
    print(len(files))

    # Point ids start at 1, as with the original hand-maintained counter.
    for point_id, path in enumerate(tqdm.tqdm(files), start=1):
        # tqdm.write keeps the progress bar intact, unlike a bare print.
        tqdm.tqdm.write(f"file {path}")

        # Hold the file open only for the read, not for encode/upsert.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()

        title = extract_title(text, path)
        vector = model.encode(text)
        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=[
                PointStruct(
                    id=point_id,
                    vector=vector,
                    payload={"title": title, "text": text},
                ),
            ],
        )


if __name__ == '__main__':
    main()
translated-content ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 79462bd3fd2533e3b71a117d1c98fafb8d4ca0e2