KevinHuSh committed
Commit: b085dec · Parent(s): 0cf2c67

add use layout or not option (#145)

* add use layout or not option
* trivial
- api/apps/conversation_app.py +4 -1
- api/apps/document_app.py +6 -1
- api/db/init_data.py +2 -3
- api/db/services/document_service.py +17 -1
- api/settings.py +1 -1
- deepdoc/parser/__init__.py +1 -1
- deepdoc/parser/pdf_parser.py +32 -0
- rag/app/book.py +12 -15
- rag/app/laws.py +13 -23
- rag/app/manual.py +62 -14
- rag/app/naive.py +11 -19
- rag/app/one.py +10 -6
- rag/app/paper.py +18 -32
- rag/app/presentation.py +16 -18
- rag/nlp/__init__.py +19 -0
- rag/nlp/huqie.py +5 -1
- rag/nlp/search.py +4 -3
- rag/svr/task_broker.py +7 -3
api/apps/conversation_app.py
CHANGED
@@ -196,7 +196,10 @@ def chat(dialog, messages, **kwargs):
 
     for _ in range(len(questions)//2):
         questions.append(questions[-1])
-    kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
+    if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]:
+        kbinfos = {"total":0, "chunks":[],"doc_aggs":[]}
+    else:
+        kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
                                         dialog.similarity_threshold,
                                         dialog.vector_similarity_weight, top=1024, aggs=False)
     knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
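The gate above means retrieval is skipped entirely whenever the dialog's prompt template declares no `knowledge` parameter, so such dialogs answer from the LLM alone. A minimal standalone sketch of that decision (the `prompt_config` shape comes from the diff; `run_retrieval` is a hypothetical stand-in for `retrievaler.retrieval`):

```python
def gather_knowledge(prompt_config, questions, run_retrieval):
    # No {knowledge} slot in the prompt template: return an empty result set.
    if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]:
        return {"total": 0, "chunks": [], "doc_aggs": []}
    # Otherwise query the index with the concatenated question history.
    return run_retrieval(" ".join(questions))

cfg = {"parameters": [{"key": "user", "optional": False}]}
print(gather_knowledge(cfg, ["hi"], lambda q: {"total": 1, "chunks": [q], "doc_aggs": []}))
# -> {'total': 0, 'chunks': [], 'doc_aggs': []}
```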
api/apps/document_app.py
CHANGED
@@ -310,7 +310,10 @@ def change_parser():
         if not e:
             return get_data_error_result(retmsg="Document not found!")
         if doc.parser_id.lower() == req["parser_id"].lower():
-            return get_json_result(data=True)
+            if "parser_config" in req:
+                if req["parser_config"] == doc.parser_config:
+                    return get_json_result(data=True)
+            else: return get_json_result(data=True)
 
         if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
             return get_data_error_result(retmsg="Not supported yet!")
@@ -319,6 +322,8 @@ def change_parser():
                                          {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"})
         if not e:
             return get_data_error_result(retmsg="Document not found!")
+        if "parser_config" in req:
+            DocumentService.update_parser_config(doc.id, req["parser_config"])
         if doc.token_num > 0:
             e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
                                                     doc.process_duation * -1)
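With `update_parser_config` wired in, the same request that switches a document's chunker can now carry parser options such as `layout_recognize`. A hedged sketch of the client call, assuming the blueprint is mounted at `/v1/document/change_parser` (the route prefix is not shown in this diff) and using placeholder IDs:

```python
import requests

payload = {
    "doc_id": "<document-id>",            # placeholder
    "parser_id": "naive",
    "parser_config": {"layout_recognize": False, "chunk_token_num": 128},
}
resp = requests.post("http://localhost:9380/v1/document/change_parser",
                     json=payload, headers={"Authorization": "<token>"})
print(resp.json())                        # expect {"data": true, ...} on success
```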
api/db/init_data.py
CHANGED
@@ -276,7 +276,7 @@ def init_llm_factory():
             drop table llm_factories;
             update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
             update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
-            update tenant set parser_ids='naive:General,
+            update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
             alter table knowledgebase modify avatar longtext;
             alter table user modify avatar longtext;
             alter table dialog modify icon longtext;
@@ -297,5 +297,4 @@ def init_web_data():
 
 if __name__ == '__main__':
     init_web_db()
-    init_web_data()
-    add_tenant_llm()
+    init_web_data()
api/db/services/document_service.py
CHANGED
@@ -118,9 +118,25 @@ class DocumentService(CommonService):
         if not docs:return
         return docs[0]["tenant_id"]
 
-
     @classmethod
     @DB.connection_context()
     def get_thumbnails(cls, docids):
         fields = [cls.model.id, cls.model.thumbnail]
         return list(cls.model.select(*fields).where(cls.model.id.in_(docids)).dicts())
+
+    @classmethod
+    @DB.connection_context()
+    def update_parser_config(cls, id, config):
+        e, d = cls.get_by_id(id)
+        if not e:raise LookupError(f"Document({id}) not found.")
+        def dfs_update(old, new):
+            for k,v in new.items():
+                if k not in old:
+                    old[k] = v
+                    continue
+                if isinstance(v, dict):
+                    assert isinstance(old[k], dict)
+                    dfs_update(old[k], v)
+                else: old[k] = v
+        dfs_update(d.parser_config, config)
+        cls.update_by_id(id, {"parser_config": d.parser_config})
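`dfs_update` is a recursive merge: keys missing from the stored `parser_config` are added, nested dicts are merged in place, and scalar values are overwritten, so a partial config from the API never wipes out options the document already has. The same rule, reproduced standalone:

```python
def dfs_update(old, new):
    # Merge `new` into `old` in place, recursing into nested dicts.
    for k, v in new.items():
        if k not in old:
            old[k] = v
            continue
        if isinstance(v, dict):
            assert isinstance(old[k], dict)
            dfs_update(old[k], v)
        else:
            old[k] = v

stored = {"layout_recognize": True, "pages": [[1, 100000]]}
dfs_update(stored, {"layout_recognize": False, "chunk_token_num": 256})
print(stored)
# -> {'layout_recognize': False, 'pages': [[1, 100000]], 'chunk_token_num': 256}
```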
api/settings.py
CHANGED
@@ -94,7 +94,7 @@ ASR_MDL = default_llm[LLM_FACTORY]["asr_model"]
 IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 
 API_KEY = LLM.get("api_key", "")
-PARSERS = LLM.get("parsers", "naive:General,
+PARSERS = LLM.get("parsers", "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One")
 
 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
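`PARSERS` is a comma-separated list of `parser_id:DisplayName` pairs; the commit appends the new `one:One` entry. A small sketch of how such a string splits into pairs (the helper name is illustrative, not part of the codebase):

```python
PARSERS = ("naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,"
           "paper:Paper,book:Book,laws:Laws,presentation:Presentation,"
           "picture:Picture,one:One")

def parse_parsers(spec):
    # Split on the first ':' only, so display names may contain extra characters.
    return [tuple(item.split(":", 1)) for item in spec.split(",") if item]

print(parse_parsers(PARSERS)[-1])   # -> ('one', 'One')
```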
deepdoc/parser/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 
 
-from .pdf_parser import HuParser as PdfParser
+from .pdf_parser import HuParser as PdfParser, PlainParser
 from .docx_parser import HuDocxParser as DocxParser
 from .excel_parser import HuExcelParser as ExcelParser
 from .ppt_parser import HuPptParser as PptParser
deepdoc/parser/pdf_parser.py
CHANGED
@@ -1073,5 +1073,37 @@ class HuParser:
         return poss
 
 
+class PlainParser(object):
+    def __call__(self, filename, **kwargs):
+        self.outlines = []
+        lines = []
+        try:
+            self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
+            outlines = self.pdf.outline
+            for page in self.pdf.pages:
+                lines.extend([t for t in page.extract_text().split("\n")])
+
+            def dfs(arr, depth):
+                for a in arr:
+                    if isinstance(a, dict):
+                        self.outlines.append((a["/Title"], depth))
+                        continue
+                    dfs(a, depth + 1)
+
+            dfs(outlines, 0)
+        except Exception as e:
+            logging.warning(f"Outlines exception: {e}")
+        if not self.outlines:
+            logging.warning(f"Miss outlines")
+
+        return [(l, "") for l in lines], []
+
+    def crop(self, ck, need_position):
+        raise NotImplementedError
+
+    @staticmethod
+    def remove_tag(txt):
+        raise NotImplementedError
+
 if __name__ == "__main__":
     pass
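`PlainParser` is the layout-free fallback: it pulls text and outline entries straight out of the PDF with PyPDF2, returns every line as a `(text, "")` pair with no position tag plus an empty table list, and deliberately raises `NotImplementedError` from `crop`/`remove_tag` so callers know there are no page images to attach. A hedged usage sketch, assuming `manual.pdf` exists locally:

```python
from deepdoc.parser import PlainParser

parser = PlainParser()                  # accepts a file path or raw PDF bytes
sections, tables = parser("manual.pdf")
print(sections[:2], tables)             # e.g. [('line one', ''), ('line two', '')] []
print(parser.outlines[:3])              # [(title, depth), ...] from the PDF outline, if any
```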
rag/app/book.py
CHANGED
@@ -12,10 +12,12 @@
 #
 import copy
 import re
+from io import BytesIO
+
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, tokenize_chunks
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser, DocxParser
+from deepdoc.parser import PdfParser, DocxParser, PlainParser
 
 
 class Pdf(PdfParser):
@@ -69,10 +71,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
         remove_contents_table(sections, eng=is_english(random_choices([t for t,_ in sections], k=200)))
         callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
         sections, tbls = pdf_parser(filename if not binary else binary,
                                     from_page=from_page, to_page=to_page, callback=callback)
+
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -87,31 +91,24 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         sections = [(l,"") for l in sections if l]
         remove_contents_table(sections, eng = is_english(random_choices([t for t,_ in sections], k=200)))
         callback(0.8, "Finish parsing.")
+
     else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
 
     make_colon_as_title(sections)
     bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
-    if bull >= 0: cks = hierarchical_merge(bull, sections, 3)
+    if bull >= 0:
+        chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 3)]
     else:
         sections = [s.split("@") for s,_ in sections]
        sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
-        cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
+        chunks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
 
     # is it English
     eng = lang.lower() == "english"#is_english(random_choices([t for t, _ in sections], k=218))
 
     res = tokenize_table(tbls, doc, eng)
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
 
-    # wrap up to es documents
-    for ck in cks:
-        d = copy.deepcopy(doc)
-        ck = "\n".join(ck)
-        if pdf_parser:
-            d["image"], poss = pdf_parser.crop(ck, need_position=True)
-            add_positions(d, poss)
-            ck = pdf_parser.remove_tag(ck)
-        tokenize(d, ck, eng)
-        res.append(d)
     return res
 
 
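Every PDF chunker touched by this commit (book, laws, manual, naive, one, paper, presentation) selects its parser the same way: `parser_config.get("layout_recognize", True)` keeps the OCR-and-layout pipeline as the default and falls back to `PlainParser` when the option is off. A hedged end-to-end sketch, assuming a local RAGFlow checkout with its dependencies and an `a_book.pdf` on disk (with layout recognition disabled, no OCR or layout models are needed):

```python
from rag.app import book

def progress(prog=None, msg=""):
    print(prog, msg)

# Plain-text ingestion: chunks are still merged and tokenized, but they carry
# no page images or position tags because PlainParser cannot crop pages.
docs = book.chunk("a_book.pdf", callback=progress,
                  parser_config={"layout_recognize": False})
print(len(docs), docs[0].get("image"))   # e.g. 42 None
```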
rag/app/laws.py
CHANGED
@@ -15,9 +15,9 @@ import re
 from io import BytesIO
 from docx import Document
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
-    make_colon_as_title, add_positions
+    make_colon_as_title, add_positions, tokenize_chunks
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser, DocxParser
+from deepdoc.parser import PdfParser, DocxParser, PlainParser
 from rag.settings import cron_logger
 
 
@@ -68,7 +68,7 @@ class Pdf(PdfParser):
 
         callback(0.8, "Text extraction finished")
 
-        return [b["text"] for b in self.boxes]
+        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
 
 
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -87,11 +87,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         for txt in Docx()(filename, binary):
             sections.append(txt)
         callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        for txt in pdf_parser(filename if not binary else binary,
-                              from_page=from_page, to_page=to_page, callback=callback):
-            sections.append(txt)
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
+        for txt, poss in pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback):
+            sections.append(txt + poss)
+
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -114,22 +116,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
 
     make_colon_as_title(sections)
     bull = bullets_category(sections)
-    cks = hierarchical_merge(bull, sections, 3)
-    if not cks: callback(0.99, "No chunk parsed out.")
-
-
-    # wrap up to es documents
-    for ck in cks:
-        print("\n-".join(ck))
-        ck = "\n".join(ck)
-        d = copy.deepcopy(doc)
-        if pdf_parser:
-            d["image"], poss = pdf_parser.crop(ck, need_position=True)
-            add_positions(d, poss)
-            ck = pdf_parser.remove_tag(ck)
-        tokenize(d, ck, eng)
-        res.append(d)
-    return res
+    chunks = hierarchical_merge(bull, sections, 3)
+    if not chunks: callback(0.99, "No chunk parsed out.")
+
+    return tokenize_chunks(["\n".join(ck) for ck in chunks], doc, eng, pdf_parser)
 
 
 if __name__ == "__main__":
rag/app/manual.py
CHANGED
@@ -2,8 +2,8 @@ import copy
 import re
 
 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency
-from deepdoc.parser import PdfParser
+from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from deepdoc.parser import PdfParser, PlainParser
 from rag.utils import num_tokens_from_string
 
 
@@ -30,9 +30,7 @@ class Pdf(PdfParser):
             # print(b)
         print("OCR:", timer()-start)
 
-        def tag(pn, left, right, top, bottom):
-            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
-                .format(pn, left, right, top, bottom)
+
 
         self._layouts_rec(zoomin)
         callback(0.65, "Layout analysis finished.")
@@ -49,6 +47,8 @@ class Pdf(PdfParser):
         for b in self.boxes:
             b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
 
+        return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
+
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
         if len(self.boxes)>0 and len(self.outlines)/len(self.boxes) > 0.1:
@@ -103,9 +103,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     pdf_parser = None
 
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        cks, tbls = pdf_parser(filename if not binary else binary,
-                               from_page=from_page, to_page=to_page, callback=callback)
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
+        sections, tbls = pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)
+        if sections and len(sections[0])<3: cks = [(t, l, [0]*5) for t, l in sections]
     else: raise NotImplementedError("file type not supported yet(pdf supported)")
     doc = {
         "docnm_kwd": filename
@@ -115,13 +116,60 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     # is it English
     eng = lang.lower() == "english"#pdf_parser.is_english
 
+    # set pivot using the most frequent type of title,
+    # then merge between 2 pivot
+    if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+        max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
+        most_level = max(0, max_lvl - 1)
+        levels = []
+        for txt, _, _ in sections:
+            for t, lvl in pdf_parser.outlines:
+                tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
+                tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
+                if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
+                    levels.append(lvl)
+                    break
+            else:
+                levels.append(max_lvl + 1)
+    else:
+        bull = bullets_category([txt for txt,_,_ in sections])
+        most_level, levels = title_frequency(bull, [(txt, l) for txt, l, poss in sections])
+
+    assert len(sections) == len(levels)
+    sec_ids = []
+    sid = 0
+    for i, lvl in enumerate(levels):
+        if lvl <= most_level and i > 0 and lvl != levels[i - 1]: sid += 1
+        sec_ids.append(sid)
+        # print(lvl, self.boxes[i]["text"], most_level, sid)
+
+    sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
+    for (img, rows), poss in tbls:
+        sections.append((rows if isinstance(rows, str) else rows[0], -1,
+                         [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
+
+    def tag(pn, left, right, top, bottom):
+        if pn+left+right+top+bottom == 0:
+            return ""
+        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
+            .format(pn, left, right, top, bottom)
+
+    chunks = []
+    last_sid = -2
+    tk_cnt = 0
+    for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
+        poss = "\t".join([tag(*pos) for pos in poss])
+        if tk_cnt < 2048 and (sec_id == last_sid or sec_id == -1):
+            if chunks:
+                chunks[-1] += "\n" + txt + poss
+                tk_cnt += num_tokens_from_string(txt)
+            continue
+        chunks.append(txt + poss)
+        tk_cnt = num_tokens_from_string(txt)
+        if sec_id > -1: last_sid = sec_id
+
     res = tokenize_table(tbls, doc, eng)
-    for ck in cks:
-        d = copy.deepcopy(doc)
-        d["image"], poss = pdf_parser.crop(ck, need_position=True)
-        add_positions(d, poss)
-        tokenize(d, pdf_parser.remove_tag(ck), eng)
-        res.append(d)
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
     return res
 
 
rag/app/naive.py
CHANGED
@@ -12,8 +12,9 @@
 #
 import copy
 import re
+from deepdoc.parser.pdf_parser import PlainParser
 from rag.app import laws
-from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
+from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
 from deepdoc.parser import PdfParser, ExcelParser
 from rag.settings import cron_logger
 
@@ -56,6 +57,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
 
     eng = lang.lower() == "english"#is_english(cks)
+    parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -69,15 +71,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         for txt in laws.Docx()(filename, binary):
             sections.append((txt, ""))
         callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
+        pdf_parser = Pdf() if parser_config["layout_recognize"] else PlainParser()
         sections, tbls = pdf_parser(filename if not binary else binary,
                                     from_page=from_page, to_page=to_page, callback=callback)
         res = tokenize_table(tbls, doc, eng)
+
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
         sections = [(excel_parser.html(binary), "")]
+
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -92,26 +97,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         sections = txt.split("\n")
         sections = [(l, "") for l in sections if l]
         callback(0.8, "Finish parsing.")
+
     else:
         raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
 
-    cks = naive_merge(sections, kwargs.get("chunk_token_num", 128), kwargs.get("delimiter", "\n!?。;!?"))
-
-
-    # wrap up to es documents
-    for ck in cks:
-        if len(ck.strip()) == 0:continue
-        print("--", ck)
-        d = copy.deepcopy(doc)
-        if pdf_parser:
-            try:
-                d["image"], poss = pdf_parser.crop(ck, need_position=True)
-            except Exception as e:
-                continue
-            add_positions(d, poss)
-            ck = pdf_parser.remove_tag(ck)
-        tokenize(d, ck, eng)
-        res.append(d)
+    chunks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;!?"))
+
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
     return res
 
 
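naive.py now pulls all of its knobs from one `parser_config` dict with the defaults shown above. Note the asymmetry in how the keys are read: `chunk_token_num` and `delimiter` keep per-key fallbacks, while `layout_recognize` is indexed directly, so a caller that supplies its own `parser_config` should include it. A small sketch mirroring that access pattern:

```python
def effective_options(**kwargs):
    parser_config = kwargs.get(
        "parser_config",
        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
    layout = parser_config["layout_recognize"]           # no fallback here
    chunk_tokens = parser_config.get("chunk_token_num", 128)
    delimiter = parser_config.get("delimiter", "\n!?。;!?")
    return layout, chunk_tokens, delimiter

print(effective_options())
# (True, 128, '\n!?。;!?')
print(effective_options(parser_config={"layout_recognize": False, "chunk_token_num": 256}))
# (False, 256, '\n!?。;!?')
```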
rag/app/one.py
CHANGED
@@ -13,7 +13,7 @@
 import re
 from rag.app import laws
 from rag.nlp import huqie, tokenize
-from deepdoc.parser import PdfParser, ExcelParser
+from deepdoc.parser import PdfParser, ExcelParser, PlainParser
 
 
 class Pdf(PdfParser):
@@ -45,7 +45,7 @@ class Pdf(PdfParser):
         for (img, rows), poss in tbls:
             sections.append((rows if isinstance(rows, str) else rows[0],
                              [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
-        return [txt for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))]
+        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))]
 
 
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -59,16 +59,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     sections = []
     if re.search(r"\.docx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        for txt in laws.Docx()(filename, binary):
-            sections.append(txt)
+        sections = [txt for txt in laws.Docx()(filename, binary) if txt]
         callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
         sections = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)
+        sections = [s for s, _ in sections if s]
+
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
         sections = [excel_parser.html(binary)]
+
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -81,8 +84,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
             if not l: break
             txt += l
         sections = txt.split("\n")
-        sections = [l for l in sections if l]
+        sections = [s for s in sections if s]
         callback(0.8, "Finish parsing.")
+
     else:
         raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
 
rag/app/paper.py
CHANGED
@@ -15,8 +15,8 @@ import re
 from collections import Counter
 
 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency
-from deepdoc.parser import PdfParser
+from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from deepdoc.parser import PdfParser, PlainParser
 import numpy as np
 from rag.utils import num_tokens_from_string
 
@@ -59,24 +59,6 @@ class Pdf(PdfParser):
         self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
         for b in self.boxes:
             b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
-        # freq = Counter([b["text"] for b in self.boxes])
-        # garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
-        # i = 0
-        # while i < len(self.boxes):
-        #     if self.boxes[i]["text"] in garbage \
-        #             or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
-        #             or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
-        #         self.boxes.pop(i)
-        #     elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno",
-        #                                                                                                 '1'):
-        #         # merge within same layouts
-        #         self.boxes[i + 1]["top"] = self.boxes[i]["top"]
-        #         self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
-        #         self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
-        #         self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
-        #         self.boxes.pop(i)
-        #     else:
-        #         i += 1
 
         def _begin(txt):
             return re.match(
@@ -148,9 +130,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
     pdf_parser = None
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        paper = pdf_parser(filename if not binary else binary,
-                           from_page=from_page, to_page=to_page, callback=callback)
+        if not kwargs.get("parser_config",{}).get("layout_recognize", True):
+            pdf_parser = PlainParser()
+            paper = {
+                "title": filename,
+                "authors": " ",
+                "abstract": "",
+                "sections": pdf_parser(filename if not binary else binary),
+                "tables": []
+            }
+        else:
+            pdf_parser = Pdf()
+            paper = pdf_parser(filename if not binary else binary,
+                               from_page=from_page, to_page=to_page, callback=callback)
     else: raise NotImplementedError("file type not supported yet(pdf supported)")
 
     doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
@@ -195,16 +187,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
                 continue
         chunks.append(txt)
         last_sid = sec_id
-
-        d = copy.deepcopy(doc)
-        d["image"], poss = pdf_parser.crop(txt, need_position=True)
-        add_positions(d, poss)
-        tokenize(d, pdf_parser.remove_tag(txt), eng)
-        res.append(d)
-        print("----------------------\n", pdf_parser.remove_tag(txt))
-
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
     return res
 
+    """
     readed = [0] * len(paper["lines"])
     # find colon firstly
     i = 0
@@ -280,7 +266,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         print(d)
         # d["image"].save(f"./logs/{i}.jpg")
     return res
-
+    """
 
 if __name__ == "__main__":
     import sys
rag/app/presentation.py
CHANGED
@@ -18,7 +18,8 @@ from PIL import Image
 
 from rag.nlp import tokenize, is_english
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser, PptParser
+from deepdoc.parser import PdfParser, PptParser, PlainParser
+from PyPDF2 import PdfReader as pdf2_read
 
 
 class Ppt(PptParser):
@@ -56,19 +57,6 @@ class Pdf(PdfParser):
         callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
         res = []
-        #################### More precisely ###################
-        # self._layouts_rec(zoomin)
-        # self._text_merge()
-        # pages = {}
-        # for b in self.boxes:
-        #     if self.__garbage(b["text"]):continue
-        #     if b["page_number"] not in pages: pages[b["page_number"]] = []
-        #     pages[b["page_number"]].append(b["text"])
-        # for i, lines in pages.items():
-        #     res.append(("\n".join(lines), self.page_images[i-1]))
-        # return res
-        ########################################
-
         for i in range(len(self.boxes)):
             lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
             res.append((lines, self.page_images[i]))
@@ -76,6 +64,16 @@ class Pdf(PdfParser):
         return res
 
 
+class PlainPdf(PlainParser):
+    def __call__(self, filename, binary=None, callback=None, **kwargs):
+        self.pdf = pdf2_read(filename if not binary else BytesIO(filename))
+        page_txt = []
+        for page in self.pdf.pages:
+            page_txt.append(page.extract_text())
+        callback(0.9, "Parsing finished")
+        return [(txt, None) for txt in page_txt]
+
+
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
     The supported file formats are pdf, pptx.
@@ -102,14 +100,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
             res.append(d)
         return res
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
+        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainPdf()
+        for pn, (txt,img) in enumerate(pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)):
             d = copy.deepcopy(doc)
             pn += from_page
-            d["image"] = img
+            if img: d["image"] = img
             d["page_num_int"] = [pn+1]
             d["top_int"] = [0]
-            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
             tokenize(d, txt, eng)
             res.append(d)
         return res
CHANGED
@@ -76,6 +76,25 @@ def tokenize(d, t, eng):
|
|
76 |
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
77 |
|
78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
def tokenize_table(tbls, doc, eng, batch_size=10):
|
80 |
res = []
|
81 |
# add tables
|
|
|
76 |
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
77 |
|
78 |
|
79 |
+
def tokenize_chunks(chunks, doc, eng, pdf_parser):
|
80 |
+
res = []
|
81 |
+
# wrap up as es documents
|
82 |
+
for ck in chunks:
|
83 |
+
if len(ck.strip()) == 0:continue
|
84 |
+
print("--", ck)
|
85 |
+
d = copy.deepcopy(doc)
|
86 |
+
if pdf_parser:
|
87 |
+
try:
|
88 |
+
d["image"], poss = pdf_parser.crop(ck, need_position=True)
|
89 |
+
add_positions(d, poss)
|
90 |
+
ck = pdf_parser.remove_tag(ck)
|
91 |
+
except NotImplementedError as e:
|
92 |
+
pass
|
93 |
+
tokenize(d, ck, eng)
|
94 |
+
res.append(d)
|
95 |
+
return res
|
96 |
+
|
97 |
+
|
98 |
def tokenize_table(tbls, doc, eng, batch_size=10):
|
99 |
res = []
|
100 |
# add tables
|
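`tokenize_chunks` centralizes the per-chunk wrap-up loop the individual chunkers used to repeat. The `except NotImplementedError: pass` branch is what makes the layout-free path work: `PlainParser.crop` raises, so the chunk keeps its raw text and simply gets no image or position fields. A self-contained sketch of that control flow, with tokenization stubbed out:

```python
import copy

class PlainStub:
    """Mimics PlainParser: cropping and tag removal are not implemented."""
    def crop(self, ck, need_position):
        raise NotImplementedError
    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError

def tokenize_chunks_sketch(chunks, doc, pdf_parser):
    res = []
    for ck in chunks:
        if len(ck.strip()) == 0:
            continue
        d = copy.deepcopy(doc)
        if pdf_parser:
            try:
                d["image"], poss = pdf_parser.crop(ck, need_position=True)
                ck = pdf_parser.remove_tag(ck)
            except NotImplementedError:
                pass                      # layout-free parser: keep raw text
        d["content_with_weight"] = ck     # stand-in for tokenize(d, ck, eng)
        res.append(d)
    return res

docs = tokenize_chunks_sketch(["first chunk", "   ", "second chunk"],
                              {"docnm_kwd": "demo.pdf"}, PlainStub())
print(len(docs), "image" in docs[0])      # 2 False
```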
rag/nlp/huqie.py
CHANGED
@@ -300,7 +300,11 @@ class Huqie:
     def qieqie(self, tks):
         tks = tks.split(" ")
         zh_num = len([1 for c in tks if c and is_chinese(c[0])])
-        if zh_num < len(tks) * 0.2: return " ".join(tks)
+        if zh_num < len(tks) * 0.2:
+            res = []
+            for tk in tks:
+                res.extend(tk.split("/"))
+            return " ".join(res)
 
         res = []
         for tk in tks:
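For token strings that are mostly non-Chinese (`zh_num` below 20%), the fine-grained pass `qieqie` now also splits each token on "/" instead of, as the removed line suggests, returning the input unchanged. A hedged example, assuming the huqie dictionaries bundled with the repo are available:

```python
from rag.nlp import huqie

# Mostly-English input: slash-joined tokens are now broken apart.
print(huqie.qieqie("tcp/ip stack overview"))
# -> "tcp ip stack overview"
```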
rag/nlp/search.py
CHANGED
@@ -68,6 +68,7 @@ class Dealer:
         s = Search()
         pg = int(req.get("page", 1)) - 1
         ps = int(req.get("size", 1000))
+        topk = int(req.get("topk", 1024))
         src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id",
                                  "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
                                  "q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])
@@ -103,7 +104,7 @@ class Dealer:
             assert emb_mdl, "No embedding model selected"
             s["knn"] = self._vector(
                 qst, emb_mdl, req.get(
-                    "similarity", 0.1),
+                    "similarity", 0.1), topk)
             s["knn"]["filter"] = bqry.to_dict()
             if "highlight" in s:
                 del s["highlight"]
@@ -292,8 +293,8 @@ class Dealer:
         ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
         if not question:
             return ranks
-        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": page_size,
-               "question": question, "vector": True,
+        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": page_size,
+               "question": question, "vector": True, "topk": top,
               "similarity": similarity_threshold}
        sres = self.search(req, index_name(tenant_id), embd_mdl)
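The `top=1024` that the dialog layer already passed to `retrieval()` is now forwarded as `req["topk"]`, read back with a 1024 default in `Dealer.search()`, and handed to `_vector`, so the kNN candidate pool is configurable end to end instead of being dropped. A sketch of the plumbing (values illustrative):

```python
# What retrieval() now puts into the search request.
req = {
    "kb_ids": ["kb1"], "doc_ids": [], "size": 30,
    "question": "how to install", "vector": True,
    "topk": 1024,                      # forwarded to the kNN clause by Dealer.search()
    "similarity": 0.2,
}
topk = int(req.get("topk", 1024))      # same default applied when callers omit it
print(topk)                            # 1024
```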
rag/svr/task_broker.py
CHANGED
@@ -81,11 +81,15 @@ def dispatch():
 
         tsks = []
         if r["type"] == FileType.PDF.value:
+            if not r["parser_config"].get("layout_recognize", True):
+                tsks.append(new_task())
+                continue
            pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
-            page_size = 12
-            if r["parser_id"] == "paper": page_size = 22
+            page_size = r["parser_config"].get("task_page_size", 12)
+            if r["parser_id"] == "paper": page_size = r["parser_config"].get("task_page_size", 22)
             if r["parser_id"] == "one": page_size = 1000000000
-            for s,e in r["parser_config"].get("pages", [(
+            for s,e in r["parser_config"].get("pages", [(1, 100000)]):
+                s -= 1
                 e = min(e, pages)
                 for p in range(s, e, page_size):
                     task = new_task()
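For the broker this means a PDF whose `parser_config` turns layout recognition off is ingested as one task for the whole file, since the plain-text path has no per-page OCR cost, while the layout path keeps splitting into page ranges whose size is now configurable through `task_page_size`. A standalone sketch of the splitting rule, with `new_task()` reduced to a dict:

```python
def plan_tasks(parser_id, parser_config, total_pages):
    # Single task when layout analysis is off.
    if not parser_config.get("layout_recognize", True):
        return [{"from_page": 0, "to_page": total_pages}]

    page_size = parser_config.get("task_page_size", 22 if parser_id == "paper" else 12)
    if parser_id == "one":
        page_size = 1000000000
    tasks = []
    for s, e in parser_config.get("pages", [(1, 100000)]):
        s -= 1                            # pages are 1-based in the config
        e = min(e, total_pages)
        for p in range(s, e, page_size):
            tasks.append({"from_page": p, "to_page": min(p + page_size, e)})
    return tasks

print(plan_tasks("paper", {}, 60))                           # three page ranges of up to 22 pages
print(plan_tasks("naive", {"layout_recognize": False}, 60))  # one task for the whole file
```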