aopstudio committed on
Commit
d923a42
·
1 Parent(s): 8227469

Add markdown support for QA parser (#1180)

Browse files

### What problem does this PR solve?

Add markdown support for QA parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Files changed (1) hide show
  1. rag/app/qa.py +54 -3
rag/app/qa.py CHANGED
@@ -145,6 +145,10 @@ def beAdoc(d, q, a, eng):
145
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
146
  return d
147
 
 
 
 
 
148
 
149
  def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
150
  """
@@ -214,6 +218,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
214
 
215
  return res
216
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
 
217
  pdf_parser = Pdf()
218
  count = 0
219
  qai_list, tbls = pdf_parser(filename if not binary else binary,
@@ -225,10 +230,58 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
225
  count += 1
226
  res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
227
  return res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
 
230
  raise NotImplementedError(
231
- "Excel and csv(txt) format files are supported.")
232
 
233
 
234
  if __name__ == "__main__":
@@ -236,6 +289,4 @@ if __name__ == "__main__":
236
 
237
  def dummy(prog=None, msg=""):
238
  pass
239
- import json
240
-
241
  chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
 
145
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
146
  return d
147
 
148
def mdQuestionLevel(s):
    """Split a Markdown line into its heading level and heading text.

    Args:
        s: A single line of Markdown text.

    Returns:
        A ``(level, text)`` tuple. ``level`` is the number of leading ``#``
        characters (0 when the line does not start with ``#``; no upper
        bound is applied here). ``text`` is ``s`` with those leading ``#``
        characters, and any whitespace that follows them, removed.
    """
    # r'#*' also matches the empty string, so re.match never returns None
    # for a str input and no fallback branch is needed.
    hashes = re.match(r'#*', s).group(0)
    return len(hashes), s[len(hashes):].lstrip()
151
+
152
 
153
  def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
154
  """
 
218
 
219
  return res
220
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
221
+ callback(0.1, "Start to parse.")
222
  pdf_parser = Pdf()
223
  count = 0
224
  qai_list, tbls = pdf_parser(filename if not binary else binary,
 
230
  count += 1
231
  res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
232
  return res
233
+ elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
234
+ callback(0.1, "Start to parse.")
235
+ txt = ""
236
+ if binary:
237
+ encoding = find_codec(binary)
238
+ txt = binary.decode(encoding, errors="ignore")
239
+ else:
240
+ with open(filename, "r") as f:
241
+ while True:
242
+ l = f.readline()
243
+ if not l:
244
+ break
245
+ txt += l
246
+ lines = txt.split("\n")
247
+ comma, tab = 0, 0
248
+ last_question, last_answer = "", ""
249
+ question_stack, level_stack = [], []
250
+ code_block = False
251
+ level_index = [-1] * 7
252
+ for index, l in enumerate(lines):
253
+ if not l.strip():
254
+ continue
255
+ if l.strip().startswith('```'):
256
+ code_block = not code_block
257
+ question_level, question = 0, ''
258
+ if not code_block:
259
+ question_level, question = mdQuestionLevel(l)
260
+
261
+ if not question_level or question_level > 6: # not a question
262
+ last_answer = f'{last_answer}\n{l}'
263
+ else: # is a question
264
+ if last_answer:
265
+ sum_question = ('\n').join(question_stack)
266
+ if sum_question:
267
+ res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
268
+ last_answer = ''
269
+
270
+ i = question_level
271
+ while question_stack and i <= level_stack[-1]:
272
+ question_stack.pop()
273
+ level_stack.pop()
274
+ question_stack.append(question)
275
+ level_stack.append(question_level)
276
+ if last_answer:
277
+ sum_question = ('\n').join(question_stack)
278
+ if sum_question:
279
+ res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
280
+ return res
281
 
282
 
283
  raise NotImplementedError(
284
+ "Excel, csv(txt), pdf and markdown format files are supported.")
285
 
286
 
287
  if __name__ == "__main__":
 
289
 
290
  def dummy(prog=None, msg=""):
291
  pass
 
 
292
  chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)