KevinHuSh commited on
Commit
64350e9
·
1 Parent(s): cf3a5c1

chage tas execution logic (#103)

Browse files
deepdoc/vision/layout_recognizer.py CHANGED
@@ -15,6 +15,8 @@ import re
15
  from collections import Counter
16
  from copy import deepcopy
17
  import numpy as np
 
 
18
  from api.utils.file_utils import get_project_base_directory
19
  from deepdoc.vision import Recognizer
20
 
@@ -35,6 +37,7 @@ class LayoutRecognizer(Recognizer):
35
  ]
36
  def __init__(self, domain):
37
  super().__init__(self.labels, domain, os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
 
38
 
39
  def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16):
40
  def __is_garbage(b):
@@ -85,7 +88,7 @@ class LayoutRecognizer(Recognizer):
85
  i += 1
86
  continue
87
  lts_[ii]["visited"] = True
88
- if lts_[ii]["type"] in ["footer", "header", "reference"]:
89
  if lts_[ii]["type"] not in garbages:
90
  garbages[lts_[ii]["type"]] = []
91
  garbages[lts_[ii]["type"]].append(bxs[i]["text"])
 
15
  from collections import Counter
16
  from copy import deepcopy
17
  import numpy as np
18
+
19
+ from api.db import ParserType
20
  from api.utils.file_utils import get_project_base_directory
21
  from deepdoc.vision import Recognizer
22
 
 
37
  ]
38
  def __init__(self, domain):
39
  super().__init__(self.labels, domain, os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
40
+ self.garbage_layouts = ["footer", "header", "reference"]
41
 
42
  def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16):
43
  def __is_garbage(b):
 
88
  i += 1
89
  continue
90
  lts_[ii]["visited"] = True
91
+ if lts_[ii]["type"] in self.garbage_layouts:
92
  if lts_[ii]["type"] not in garbages:
93
  garbages[lts_[ii]["type"]] = []
94
  garbages[lts_[ii]["type"]].append(bxs[i]["text"])
docker/entrypoint.sh CHANGED
@@ -6,11 +6,10 @@ export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/
6
 
7
  PY=/root/miniconda3/envs/py11/bin/python
8
 
9
-
10
-
11
  function task_exe(){
12
- sleep 60;
13
- while [ 1 -eq 1 ];do mpirun -n 4 --allow-run-as-root $PY rag/svr/task_executor.py ; done
 
14
  }
15
 
16
  function watch_broker(){
@@ -29,7 +28,12 @@ function task_bro(){
29
  }
30
 
31
  task_bro &
32
- task_exe &
 
 
 
 
 
33
 
34
  $PY api/ragflow_server.py
35
 
 
6
 
7
  PY=/root/miniconda3/envs/py11/bin/python
8
 
 
 
9
  function task_exe(){
10
+ while [ 1 -eq 1 ];do
11
+ $PY rag/svr/task_executor.py $1 $2;
12
+ done
13
  }
14
 
15
  function watch_broker(){
 
28
  }
29
 
30
  task_bro &
31
+
32
+ WS=8
33
+ for ((i=0;i<WS;i++))
34
+ do
35
+ task_exe $i $WS &
36
+ done
37
 
38
  $PY api/ragflow_server.py
39
 
rag/nlp/__init__.py CHANGED
@@ -119,7 +119,6 @@ def add_positions(d, poss):
119
  d["page_num_int"].append(pn + 1)
120
  d["top_int"].append(top)
121
  d["position_int"].append((pn + 1, left, right, top, bottom))
122
- d["top_int"] = d["top_int"][:1]
123
 
124
 
125
  def remove_contents_table(sections, eng=False):
 
119
  d["page_num_int"].append(pn + 1)
120
  d["top_int"].append(top)
121
  d["position_int"].append((pn + 1, left, right, top, bottom))
 
122
 
123
 
124
  def remove_contents_table(sections, eng=False):
rag/nlp/query.py CHANGED
@@ -157,11 +157,11 @@ class EsQueryer:
157
  s = 1e-9
158
  for k, v in qtwt.items():
159
  if k in dtwt:
160
- s += v * dtwt[k]
161
  q = 1e-9
162
  for k, v in qtwt.items():
163
  q += v * v
164
  d = 1e-9
165
  for k, v in dtwt.items():
166
  d += v * v
167
- return s / math.sqrt(q) / math.sqrt(d)
 
157
  s = 1e-9
158
  for k, v in qtwt.items():
159
  if k in dtwt:
160
+ s += v# * dtwt[k]
161
  q = 1e-9
162
  for k, v in qtwt.items():
163
  q += v * v
164
  d = 1e-9
165
  for k, v in dtwt.items():
166
  d += v * v
167
+ return s / q#math.sqrt(q) / math.sqrt(d)
rag/nlp/search.py CHANGED
@@ -192,7 +192,7 @@ class Dealer:
192
  return [float(t) for t in txt.split("\t")]
193
 
194
  def insert_citations(self, answer, chunks, chunk_v,
195
- embd_mdl, tkweight=0.3, vtweight=0.7):
196
  assert len(chunks) == len(chunk_v)
197
  pieces = re.split(r"([;。?!!\n]|[a-z][.?;!][ \n])", answer)
198
  for i in range(1, len(pieces)):
@@ -224,7 +224,7 @@ class Dealer:
224
  chunks_tks,
225
  tkweight, vtweight)
226
  mx = np.max(sim) * 0.99
227
- if mx < 0.55:
228
  continue
229
  cites[idx[i]] = list(
230
  set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
@@ -237,7 +237,7 @@ class Dealer:
237
  if i not in cites:
238
  continue
239
  for c in cites[i]: assert int(c) < len(chunk_v)
240
- res += "##%s$$" % "$".join(cites[i])
241
 
242
  return res
243
 
 
192
  return [float(t) for t in txt.split("\t")]
193
 
194
  def insert_citations(self, answer, chunks, chunk_v,
195
+ embd_mdl, tkweight=0.7, vtweight=0.3):
196
  assert len(chunks) == len(chunk_v)
197
  pieces = re.split(r"([;。?!!\n]|[a-z][.?;!][ \n])", answer)
198
  for i in range(1, len(pieces)):
 
224
  chunks_tks,
225
  tkweight, vtweight)
226
  mx = np.max(sim) * 0.99
227
+ if mx < 0.35:
228
  continue
229
  cites[idx[i]] = list(
230
  set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
 
237
  if i not in cites:
238
  continue
239
  for c in cites[i]: assert int(c) < len(chunk_v)
240
+ for c in cites[i]: res += f" ##{c}$$"
241
 
242
  return res
243
 
rag/nlp/term_weight.py CHANGED
@@ -152,6 +152,7 @@ class Dealer:
152
  def ner(t):
153
  if not self.ne or t not in self.ne:
154
  return 1
 
155
  m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
156
  "firstnm": 1}
157
  return m[self.ne[t]]
 
152
  def ner(t):
153
  if not self.ne or t not in self.ne:
154
  return 1
155
+ if re.match(r"[0-9,.]+$", t): return 2
156
  m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
157
  "firstnm": 1}
158
  return m[self.ne[t]]
rag/settings.py CHANGED
@@ -36,3 +36,5 @@ es_logger = getLogger("es")
36
  minio_logger = getLogger("minio")
37
  cron_logger = getLogger("cron_logger")
38
  chunk_logger = getLogger("chunk_logger")
 
 
 
36
  minio_logger = getLogger("minio")
37
  cron_logger = getLogger("cron_logger")
38
  chunk_logger = getLogger("chunk_logger")
39
+ database_logger = getLogger("database")
40
+
rag/svr/task_executor.py CHANGED
@@ -23,13 +23,14 @@ import re
23
  import sys
24
  import traceback
25
  from functools import partial
26
- from timeit import default_timer as timer
 
 
27
 
28
  import numpy as np
29
  from elasticsearch_dsl import Q
30
 
31
  from api.db.services.task_service import TaskService
32
- from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
33
  from rag.utils import ELASTICSEARCH
34
  from rag.utils import MINIO
35
  from rag.utils import rmSpace, findMaxTm
@@ -43,7 +44,6 @@ from rag.app import laws, paper, presentation, manual, qa, table, book, resume,
43
  from api.db import LLMType, ParserType
44
  from api.db.services.document_service import DocumentService
45
  from api.db.services.llm_service import LLMBundle
46
- from api.settings import database_logger
47
  from api.utils.file_utils import get_project_base_directory
48
 
49
  BATCH_SIZE = 64
@@ -267,4 +267,4 @@ if __name__ == "__main__":
267
  from mpi4py import MPI
268
 
269
  comm = MPI.COMM_WORLD
270
- main(comm.Get_size(), comm.Get_rank())
 
23
  import sys
24
  import traceback
25
  from functools import partial
26
+
27
+ from rag.settings import database_logger
28
+ from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
29
 
30
  import numpy as np
31
  from elasticsearch_dsl import Q
32
 
33
  from api.db.services.task_service import TaskService
 
34
  from rag.utils import ELASTICSEARCH
35
  from rag.utils import MINIO
36
  from rag.utils import rmSpace, findMaxTm
 
44
  from api.db import LLMType, ParserType
45
  from api.db.services.document_service import DocumentService
46
  from api.db.services.llm_service import LLMBundle
 
47
  from api.utils.file_utils import get_project_base_directory
48
 
49
  BATCH_SIZE = 64
 
267
  from mpi4py import MPI
268
 
269
  comm = MPI.COMM_WORLD
270
+ main(int(sys.argv[2]), int(sys.argv[1]))