Spaces:

Ritvik19
/

marker-io

Running

App Files Files Community

marker-io / marker /postprocessors /markdown.py

Ritvik19

Add all files and directories

c8a32e7 2 months ago

raw

history blame

No virus

7.23 kB

	from marker.schema.merged import MergedLine, MergedBlock, FullyMergedBlock
	from marker.schema.page import Page
	import re
	import regex
	from typing import List


	def escape_markdown(text):
	    """Backslash-escape the Markdown-significant characters in ``text``.

	    The escape set currently contains only ``#``: each occurrence is
	    returned preceded by a single backslash, and every other character
	    passes through unchanged.
	    """
	    # Character class holding everything that must be escaped.
	    escape_class = r"[#]"
	    # The replacement is a literal backslash followed by the matched char.
	    return re.sub(escape_class, r'\\\g<0>', text)


	def surround_text(s, char_to_insert):
	    """Wrap the non-whitespace core of ``s`` in ``char_to_insert``.

	    Leading and trailing whitespace stay outside the inserted markers, so
	    ``"  word "`` with ``"*"`` becomes ``"  *word* "``.

	    A string with no non-whitespace content is returned unchanged:
	    wrapping it would emit an empty pair of markers and, because the whole
	    string counts as both leading and trailing whitespace, duplicate it.
	    """
	    core = s.strip()
	    if not core:
	        return s
	    leading = s[:len(s) - len(s.lstrip())]
	    trailing = s[len(s.rstrip()):]
	    return leading + char_to_insert + core + char_to_insert + trailing


	def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
	    """Collapse the spans of every line into one ``MergedLine`` per line.

	    For each page, each block's lines are rebuilt by concatenating their
	    span texts; interior italic/bold spans are wrapped in ``*`` / ``**``.
	    Lines without spans are skipped and blocks left with no lines are
	    dropped.

	    Returns:
	        One list of ``MergedBlock`` per input page, in page order.
	    """
	    result = []
	    for page in pages:
	        blocks_on_page = []
	        for block in page.blocks:
	            lines_out = []
	            for line in block.lines:
	                spans = line.spans
	                span_count = len(spans)
	                if span_count == 0:
	                    continue

	                fonts = []
	                pieces = []
	                for idx, span in enumerate(spans):
	                    fonts.append(span.font.lower())

	                    # Look ahead for the first following span whose stripped
	                    # text is longer than 2 characters. If none qualifies the
	                    # lookahead ends up being the last span of the line; it
	                    # is None only when nothing follows the current span.
	                    lookahead = None
	                    for j in range(idx + 1, span_count):
	                        lookahead = spans[j]
	                        if len(lookahead.text.strip()) > 2:
	                            break

	                    piece = span.text

	                    # Don't bold or italicize very short sequences.
	                    # Avoid styling the first and last span so that lines
	                    # can be joined properly later on.
	                    if len(piece) > 3 and 0 < idx < span_count - 1:
	                        if span.italic and not (lookahead and lookahead.italic):
	                            piece = surround_text(piece, "*")
	                        elif span.bold and not (lookahead and lookahead.bold):
	                            piece = surround_text(piece, "**")
	                    pieces.append(piece)

	                lines_out.append(MergedLine(
	                    text="".join(pieces),
	                    fonts=fonts,
	                    bbox=line.bbox
	                ))

	            if lines_out:
	                blocks_on_page.append(MergedBlock(
	                    lines=lines_out,
	                    pnum=block.pnum,
	                    bbox=block.bbox,
	                    block_type=block.block_type
	                ))
	        result.append(blocks_on_page)

	    return result


	def block_surround(text, block_type):
	    """Decorate a merged block's text with the Markdown for its type.

	    - ``Section-header`` / ``Title``: turned into a title-cased ``##`` / ``#``
	      heading unless the text already starts with ``#``.
	    - ``Table``: padded with a newline on each side.
	    - ``List-item`` / ``Text``: passed through ``escape_markdown``.
	    - ``Code``: wrapped in a fenced code block.
	    - ``Formula``: stripped and padded with newlines, but only when the
	      stripped text both starts and ends with ``$$``.

	    Any other block type is returned unchanged.
	    """
	    if block_type == "Section-header":
	        if text.startswith("#"):
	            return text
	        return "\n## " + text.strip().title() + "\n"

	    if block_type == "Title":
	        if text.startswith("#"):
	            return text
	        return "# " + text.strip().title() + "\n"

	    if block_type == "Table":
	        return "\n" + text + "\n"

	    if block_type in ("List-item", "Text"):
	        return escape_markdown(text)

	    if block_type == "Code":
	        return "\n```\n" + text + "\n```\n"

	    if block_type == "Formula":
	        trimmed = text.strip()
	        if trimmed.startswith("$$") and trimmed.endswith("$$"):
	            return "\n" + trimmed + "\n"

	    return text


	def line_separator(line1, line2, block_type, is_continuation=False):
	    """Join two consecutive lines of a block with a suitable separator.

	    Args:
	        line1: Text accumulated so far.
	        line2: The next line to append.
	        block_type: Layout label of the block both lines belong to.
	        is_continuation: Caller-supplied flag; when true (and no earlier
	            rule applied) the lines are joined with a single space.

	    Returns:
	        The combined text. Rules are tried in this order:
	        1. ``line1`` ends in letter+hyphen and ``line2`` starts with a
	           lowercase letter/digit: drop the hyphen and join directly.
	        2. Title / Section-header: join with a space.
	        3. Formula: join with a newline.
	        4. Text-like block whose ``line1`` ends mid-sentence and whose
	           ``line2`` starts with a letter/digit: join with a space.
	        5. ``is_continuation``: join with a space.
	        6. Text-like block whose ``line1`` ends a sentence: blank line.
	        7. Table: blank line.
	        8. Anything else: newline.
	    """
	    # These fragments are interpolated inside ``[...]`` below, so they must
	    # be plain concatenations of class members. A ``|`` inside a character
	    # class is a literal pipe (not alternation) and would make "|" count as
	    # a letter, wrongly joining lines that end or start with a pipe.
	    # Should cover latin-derived languages and russian
	    lowercase_letters = r'\p{Lo}\p{Ll}\d'
	    hyphens = r'-—¬'
	    # Remove hyphen in current line if next line and current line appear to be joined
	    hyphen_pattern = regex.compile(rf'.*[{lowercase_letters}][{hyphens}]\s?$', regex.DOTALL)
	    if line1 and hyphen_pattern.match(line1) and regex.match(rf"^\s?[{lowercase_letters}]", line2):
	        # Split on — or - from the right
	        line1 = regex.split(rf"[{hyphens}]\s?$", line1)[0]
	        return line1.rstrip() + line2.lstrip()

	    all_letters = r'\p{L}\d'
	    sentence_continuations = r',;\(\—\"\'\*'
	    sentence_ends = r'。ๆ\.?!'
	    line_end_pattern = regex.compile(rf'.*[{lowercase_letters}][{sentence_continuations}]?\s?$', regex.DOTALL)
	    line_start_pattern = regex.compile(rf'^\s?[{all_letters}]', regex.DOTALL)
	    sentence_end_pattern = regex.compile(rf'.*[{sentence_ends}]\s?$', regex.DOTALL)

	    text_blocks = ["Text", "List-item", "Footnote", "Caption", "Figure"]
	    is_text_block = block_type in text_blocks

	    if block_type in ["Title", "Section-header"]:
	        return line1.rstrip() + " " + line2.lstrip()
	    if block_type == "Formula":
	        return line1 + "\n" + line2
	    if line_end_pattern.match(line1) and line_start_pattern.match(line2) and is_text_block:
	        # Sentence runs on across the line break.
	        return line1.rstrip() + " " + line2.lstrip()
	    if is_continuation:
	        return line1.rstrip() + " " + line2.lstrip()
	    if is_text_block and sentence_end_pattern.match(line1):
	        # Sentence finished: start a new paragraph.
	        return line1 + "\n\n" + line2
	    if block_type == "Table":
	        return line1 + "\n\n" + line2
	    return line1 + "\n" + line2


	def block_separator(line1, line2, block_type1, block_type2):
	    """Return ``line2`` prefixed with the separator that follows a block.

	    A blank line (two newlines) is used after a ``Text`` block and a single
	    newline after any other type. ``line1`` and ``block_type2`` are accepted
	    but not consulted.
	    """
	    prefix = "\n\n" if block_type1 == "Text" else "\n"
	    return prefix + line2


	def merge_lines(blocks: List[List[MergedBlock]]):
	    """Merge runs of same-typed blocks into ``FullyMergedBlock`` objects.

	    Blocks are walked in page order. Lines are accumulated into one text
	    buffer via ``line_separator`` for as long as the block type stays the
	    same; when the type changes, the buffer is passed through
	    ``block_surround`` and emitted. The last buffer is always emitted, so
	    the returned list contains at least one element.
	    """
	    merged = []
	    last_type = None
	    last_line = None
	    pending_text = ""
	    current_type = ""

	    for page_blocks in blocks:
	        for block in page_blocks:
	            current_type = block.block_type

	            # A change of block type closes the run collected so far.
	            if current_type != last_type and last_type:
	                finished = FullyMergedBlock(
	                    text=block_surround(pending_text, last_type),
	                    block_type=last_type
	                )
	                merged.append(finished)
	                pending_text = ""
	            last_type = current_type

	            # Join lines in the block together properly
	            for line in block.lines:
	                height = line.bbox[3] - line.bbox[1]
	                if last_line:
	                    last_height = last_line.bbox[3] - last_line.bbox[1]
	                    last_x = last_line.bbox[0]
	                else:
	                    last_height = 0
	                    last_x = 0
	                last_line = line

	                # Same line height and same left edge as the previous line.
	                same_geometry = height == last_height and line.bbox[0] == last_x

	                if not pending_text:
	                    pending_text = line.text
	                else:
	                    pending_text = line_separator(pending_text, line.text, current_type, same_geometry)

	    # Append the final block
	    merged.append(
	        FullyMergedBlock(
	            text=block_surround(pending_text, last_type),
	            block_type=current_type
	        )
	    )
	    return merged


	def get_full_text(text_blocks):
	    """Concatenate merged blocks into the final document string.

	    The first block contributes its text as-is; each later block is
	    appended via ``block_separator``, which is given the previous and
	    current block's text and type.
	    """
	    pieces = []
	    previous = None
	    for current in text_blocks:
	        if not previous:
	            pieces.append(current.text)
	        else:
	            pieces.append(
	                block_separator(previous.text, current.text, previous.block_type, current.block_type)
	            )
	        previous = current
	    return "".join(pieces)