kuschzzp
Fix #3230: when parsing a docx file with the Book parsing method, to_page is always -1, so the block count is 0 even when parsing succeeds (#3249)
ff43695
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # | |
| from docx import Document | |
| import re | |
| import pandas as pd | |
| from collections import Counter | |
| from rag.nlp import rag_tokenizer | |
| from io import BytesIO | |
class RAGFlowDocxParser:
    """Parser for .docx files.

    Calling an instance returns a pair:
      * ``secs`` - list of ``(paragraph_text, style_name)`` tuples for the
        paragraphs whose rendered page index falls in ``[from_page, to_page)``;
      * ``tbls`` - list of flattened textual renderings of every table.
    """

    # Cell-content classifiers, tried in order; the first match wins.  Tags:
    #   Dt = date-like, Nu = numeric, Ca = code/identifier, En = English text,
    #   NE = number followed by a unit/annotation, Sg = single character.
    # Compiled once here instead of rebuilding the list (and re-resolving each
    # pattern) for every table cell.
    # NOTE(review): the "DT" tag below differs in case from "Dt" elsewhere;
    # kept as-is because unifying it would change the majority-type vote.
    _CELL_PATTERNS = [
        (re.compile(r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$"), "Dt"),
        (re.compile(r"^(20|19)[0-9]{2}年$"), "Dt"),
        (re.compile(r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$"), "Dt"),
        (re.compile(r"^[0-9]{1,2}[月/-][0-9]{1,2}日*$"), "Dt"),
        (re.compile(r"^第*[一二三四1-4]季度$"), "Dt"),
        (re.compile(r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$"), "Dt"),
        (re.compile(r"^(20|19)[0-9]{2}[ABCDE]$"), "DT"),
        (re.compile(r"^[0-9.,+%/ -]+$"), "Nu"),
        (re.compile(r"^[0-9A-Z/\._~-]+$"), "Ca"),
        (re.compile(r"^[A-Z]*[a-z' -]+$"), "En"),
        (re.compile(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$"), "NE"),
        (re.compile(r"^.{1}$"), "Sg"),
    ]

    def __extract_table_content(self, tb):
        """Convert a python-docx table into a DataFrame and flatten it to text."""
        rows = [[c.text for c in row.cells] for row in tb.rows]
        return self.__compose_table_content(pd.DataFrame(rows))

    def __compose_table_content(self, df):
        """Render a table DataFrame as "header: cell" text lines.

        Row 0 (plus, for numeric tables, any later row that does not share
        the table's dominant cell type) is treated as a header row whose
        values are prefixed onto each data cell.

        Returns a list of strings: one string per data row for wide tables
        (more than 3 columns), otherwise a single newline-joined string;
        ``[]`` for tables with fewer than 2 rows.
        """

        def block_type(b):
            # First match among the precompiled anchored patterns wins.
            for pattern, tag in self._CELL_PATTERNS:
                if pattern.search(b):
                    return tag
            # Fall back to tokenization-based heuristics:
            #   Tx = short text, Lx = long text, Nr = person name, Ot = other.
            tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
            if len(tks) > 3:
                return "Tx" if len(tks) < 12 else "Lx"
            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                return "Nr"
            return "Ot"

        if len(df) < 2:
            return []

        # Dominant cell type over all data rows (row 0 is assumed header).
        type_counts = Counter(
            block_type(str(df.iloc[i, j]))
            for i in range(1, len(df))
            for j in range(len(df.iloc[i, :]))
        )
        max_type = max(type_counts.items(), key=lambda x: x[1])[0]

        colnm = len(df.iloc[0, :])
        hdrows = [0]  # the header does not necessarily appear only in the first row
        if max_type == "Nu":
            # In numeric tables, a later row whose dominant type is not
            # numeric is treated as an additional (repeated) header row.
            for r in range(1, len(df)):
                tys = Counter(block_type(str(df.iloc[r, j]))
                              for j in range(len(df.iloc[r, :])))
                tys = max(tys.items(), key=lambda x: x[1])[0]
                if tys != max_type:
                    hdrows.append(r)

        lines = []
        for i in range(1, len(df)):
            if i in hdrows:
                continue
            # Negative offsets of the header rows above row i; keep only the
            # contiguous run of header rows closest to this data row.
            hr = [r - i for r in hdrows]
            hr = [r for r in hr if r < 0]
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1

            # Build a "h1,h2: " prefix per column from the selected header rows.
            headers = []
            for j in range(len(df.iloc[i, :])):
                t = []
                for h in hr:
                    x = str(df.iloc[i + h, j]).strip()
                    if x in t:
                        continue
                    t.append(x)
                t = ",".join(t)
                if t:
                    t += ": "
                headers.append(t)

            cells = []
            for j in range(len(df.iloc[i, :])):
                if not str(df.iloc[i, j]):
                    continue
                cells.append(headers[j] + str(df.iloc[i, j]))
            lines.append(";".join(cells))

        if colnm > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        """Parse *fnm* (a path string or raw .docx bytes).

        Only paragraphs whose rendered page index falls in
        ``[from_page, to_page)`` contribute text; page boundaries are
        detected via Word's ``lastRenderedPageBreak`` markers inside runs.
        Returns ``(secs, tbls)`` as described on the class.
        """
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
        pn = 0  # current rendered page number
        secs = []  # parsed (text, style_name) sections
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            runs_within_single_paragraph = []  # save runs within the range of pages
            for run in p.runs:
                if pn > to_page:
                    break
                if from_page <= pn < to_page and p.text.strip():
                    runs_within_single_paragraph.append(run.text)  # append run.text first
                # A rendered page break inside the run's XML starts a new page.
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
            # NOTE(review): paragraphs outside the page range still append an
            # empty ("", style) entry; kept as-is to preserve section indexing
            # for existing callers.
            secs.append(("".join(runs_within_single_paragraph),
                         p.style.name if hasattr(p.style, 'name') else ''))  # then concat run.text as part of the paragraph

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls