Kevin Hu committed on
Commit
64508f3
·
1 Parent(s): e9c1552

let presentation do raptor (#2838)

Browse files

### What problem does this PR solve?

#2837

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Files changed (2) hide show
  1. api/apps/document_app.py +3 -2
  2. rag/app/qa.py +10 -1
api/apps/document_app.py CHANGED
@@ -439,8 +439,9 @@ def change_parser():
439
  else:
440
  return get_json_result(data=True)
441
 
442
- if doc.type == FileType.VISUAL or re.search(
443
- r"\.(ppt|pptx|pages)$", doc.name):
 
444
  return get_data_error_result(retmsg="Not supported yet!")
445
 
446
  e = DocumentService.update_by_id(doc.id,
 
439
  else:
440
  return get_json_result(data=True)
441
 
442
+ if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
443
+ or (re.search(
444
+ r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
445
  return get_data_error_result(retmsg="Not supported yet!")
446
 
447
  e = DocumentService.update_by_id(doc.id,
rag/app/qa.py CHANGED
@@ -68,6 +68,7 @@ class Excel(ExcelParser):
68
  [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
69
  return res
70
 
 
71
  class Pdf(PdfParser):
72
  def __call__(self, filename, binary=None, from_page=0,
73
  to_page=100000, zoomin=3, callback=None):
@@ -155,6 +156,7 @@ class Pdf(PdfParser):
155
  if last_q:
156
  qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
157
  return qai_list, tbls
 
158
  def get_tbls_info(self, tbls, tbl_index):
159
  if tbl_index >= len(tbls):
160
  return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
@@ -166,10 +168,13 @@ class Pdf(PdfParser):
166
  tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
167
  .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
168
  tbl_text = ''.join(tbls[tbl_index][0][1])
169
- return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text
 
 
170
  class Docx(DocxParser):
171
  def __init__(self):
172
  pass
 
173
  def get_picture(self, document, paragraph):
174
  img = paragraph._element.xpath('.//pic:pic')
175
  if not img:
@@ -242,6 +247,7 @@ class Docx(DocxParser):
242
  tbls.append(((None, html), ""))
243
  return qai_list, tbls
244
 
 
245
  def rmPrefix(txt):
246
  return re.sub(
247
  r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
@@ -258,6 +264,7 @@ def beAdocPdf(d, q, a, eng, image, poss):
258
  add_positions(d, poss)
259
  return d
260
 
 
261
  def beAdocDocx(d, q, a, eng, image):
262
  qprefix = "Question: " if eng else "问题:"
263
  aprefix = "Answer: " if eng else "回答:"
@@ -268,6 +275,7 @@ def beAdocDocx(d, q, a, eng, image):
268
  d["image"] = image
269
  return d
270
 
 
271
  def beAdoc(d, q, a, eng):
272
  qprefix = "Question: " if eng else "问题:"
273
  aprefix = "Answer: " if eng else "回答:"
@@ -282,6 +290,7 @@ def mdQuestionLevel(s):
282
  match = re.match(r'#*', s)
283
  return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
284
 
 
285
  def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
286
  """
287
  Excel and csv(txt) format files are supported.
 
68
  [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
69
  return res
70
 
71
+
72
  class Pdf(PdfParser):
73
  def __call__(self, filename, binary=None, from_page=0,
74
  to_page=100000, zoomin=3, callback=None):
 
156
  if last_q:
157
  qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
158
  return qai_list, tbls
159
+
160
  def get_tbls_info(self, tbls, tbl_index):
161
  if tbl_index >= len(tbls):
162
  return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
 
168
  tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
169
  .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
170
  tbl_text = ''.join(tbls[tbl_index][0][1])
171
+ return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag,
172
+
173
+
174
  class Docx(DocxParser):
175
  def __init__(self):
176
  pass
177
+
178
  def get_picture(self, document, paragraph):
179
  img = paragraph._element.xpath('.//pic:pic')
180
  if not img:
 
247
  tbls.append(((None, html), ""))
248
  return qai_list, tbls
249
 
250
+
251
  def rmPrefix(txt):
252
  return re.sub(
253
  r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
 
264
  add_positions(d, poss)
265
  return d
266
 
267
+
268
  def beAdocDocx(d, q, a, eng, image):
269
  qprefix = "Question: " if eng else "问题:"
270
  aprefix = "Answer: " if eng else "回答:"
 
275
  d["image"] = image
276
  return d
277
 
278
+
279
  def beAdoc(d, q, a, eng):
280
  qprefix = "Question: " if eng else "问题:"
281
  aprefix = "Answer: " if eng else "回答:"
 
290
  match = re.match(r'#*', s)
291
  return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
292
 
293
+
294
  def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
295
  """
296
  Excel and csv(txt) format files are supported.