import markdown import re import os import math from textwrap import dedent from functools import lru_cache from pymdownx.superfences import fence_code_format from latex2mathml.converter import convert as tex2mathml from shared_utils.config_loader import get_conf as get_conf from shared_utils.text_mask import apply_gpt_academic_string_mask markdown_extension_configs = { "mdx_math": { "enable_dollar_delimiter": True, "use_gitlab_delimiters": False, }, } code_highlight_configs = { "pymdownx.superfences": { "css_class": "codehilite", "custom_fences": [ {"name": "mermaid", "class": "mermaid", "format": fence_code_format} ], }, "pymdownx.highlight": { "css_class": "codehilite", "guess_lang": True, # 'auto_title': True, # 'linenums': True }, } code_highlight_configs_block_mermaid = { "pymdownx.superfences": { "css_class": "codehilite", # "custom_fences": [ # {"name": "mermaid", "class": "mermaid", "format": fence_code_format} # ], }, "pymdownx.highlight": { "css_class": "codehilite", "guess_lang": True, # 'auto_title': True, # 'linenums': True }, } mathpatterns = { r"(?") return f'$${content}$$' else: return f'${content}$' def replace_math_render(match): content = match.group(1) if "mode=display" in match.group(0): if "\\begin{aligned}" in content: content = content.replace("\\begin{aligned}", "\\begin{array}") content = content.replace("\\end{aligned}", "\\end{array}") content = content.replace("&", " ") content = tex2mathml_catch_exception(content, display="block") return content else: return tex2mathml_catch_exception(content) def markdown_bug_hunt(content): """ 解决一个mdx_math的bug(单$包裹begin命令时多余\n", "") return content def is_equation(txt): """ 判定是否为公式 | 测试1 写出洛伦兹定律,使用tex格式公式 测试2 给出柯西不等式,使用latex格式 测试3 写出麦克斯韦方程组 """ if "```" in txt and "```reference" not in txt: return False if "$" not in txt and "\\[" not in txt: return False matches = [] for pattern, property in mathpatterns.items(): flags = re.ASCII | re.DOTALL if property["allow_multi_lines"] else re.ASCII matches.extend(re.findall(pattern, txt, flags)) if len(matches) == 0: return False contain_any_eq = False illegal_pattern = re.compile(r"[^\x00-\x7F]|echo") for match in matches: if len(match) != 3: return False eq_canidate = match[1] if illegal_pattern.search(eq_canidate): return False else: contain_any_eq = True return contain_any_eq def fix_markdown_indent(txt): # fix markdown indent if (" - " not in txt) or (". " not in txt): # do not need to fix, fast escape return txt # walk through the lines and fix non-standard indentation lines = txt.split("\n") pattern = re.compile(r"^\s+-") activated = False for i, line in enumerate(lines): if line.startswith("- ") or line.startswith("1. "): activated = True if activated and pattern.match(line): stripped_string = line.lstrip() num_spaces = len(line) - len(stripped_string) if (num_spaces % 4) == 3: num_spaces_should_be = math.ceil(num_spaces / 4) * 4 lines[i] = " " * num_spaces_should_be + stripped_string return "\n".join(lines) FENCED_BLOCK_RE = re.compile( dedent( r""" (?P^[ \t]*(?:~{3,}|`{3,}))[ ]* # opening fence ((\{(?P[^\}\n]*)\})| # (optional {attrs} or (\.?(?P[\w#.+-]*)[ ]*)? # optional (.)lang (hl_lines=(?P"|')(?P.*?)(?P=quot)[ ]*)?) # optional hl_lines) \n # newline (end of opening fence) (?P.*?)(?<=\n) # the code block (?P=fence)[ ]*$ # closing fence """ ), re.MULTILINE | re.DOTALL | re.VERBOSE, ) def get_line_range(re_match_obj, txt): start_pos, end_pos = re_match_obj.regs[0] num_newlines_before = txt[: start_pos + 1].count("\n") line_start = num_newlines_before line_end = num_newlines_before + txt[start_pos:end_pos].count("\n") + 1 return line_start, line_end def fix_code_segment_indent(txt): lines = [] change_any = False txt_tmp = txt while True: re_match_obj = FENCED_BLOCK_RE.search(txt_tmp) if not re_match_obj: break if len(lines) == 0: lines = txt.split("\n") # 清空 txt_tmp 对应的位置方便下次搜索 start_pos, end_pos = re_match_obj.regs[0] txt_tmp = txt_tmp[:start_pos] + " " * (end_pos - start_pos) + txt_tmp[end_pos:] line_start, line_end = get_line_range(re_match_obj, txt) # 获取公共缩进 shared_indent_cnt = 1e5 for i in range(line_start, line_end): stripped_string = lines[i].lstrip() num_spaces = len(lines[i]) - len(stripped_string) if num_spaces < shared_indent_cnt: shared_indent_cnt = num_spaces # 修复缩进 if (shared_indent_cnt < 1e5) and (shared_indent_cnt % 4) == 3: num_spaces_should_be = math.ceil(shared_indent_cnt / 4) * 4 for i in range(line_start, line_end): add_n = num_spaces_should_be - shared_indent_cnt lines[i] = " " * add_n + lines[i] if not change_any: # 遇到第一个 change_any = True if change_any: return "\n".join(lines) else: return txt def fix_dollar_sticking_bug(txt): """ 修复不标准的dollar公式符号的问题 """ txt_result = "" single_stack_height = 0 double_stack_height = 0 while True: while True: index = txt.find('$') if index == -1: txt_result += txt return txt_result if single_stack_height > 0: if txt[:(index+1)].find('\n') > 0 or txt[:(index+1)].find('') > 0 or txt[:(index+1)].find('') > 0: print('公式之中出现了异常 (Unexpect element in equation)') single_stack_height = 0 txt_result += ' $' continue if double_stack_height > 0: if txt[:(index+1)].find('\n\n') > 0: print('公式之中出现了异常 (Unexpect element in equation)') double_stack_height = 0 txt_result += '$$' continue is_double = (txt[index+1] == '$') if is_double: if single_stack_height != 0: # add a padding txt = txt[:(index+1)] + " " + txt[(index+1):] continue if double_stack_height == 0: double_stack_height = 1 else: double_stack_height = 0 txt_result += txt[:(index+2)] txt = txt[(index+2):] else: if double_stack_height != 0: # print(txt[:(index)]) print('发现异常嵌套公式') if single_stack_height == 0: single_stack_height = 1 else: single_stack_height = 0 # print(txt[:(index)]) txt_result += txt[:(index+1)] txt = txt[(index+1):] break def markdown_convertion_for_file(txt): """ 将Markdown格式的文本转换为HTML格式。如果包含数学公式,则先将公式转换为HTML格式。 """ from themes.theme import advanced_css pre = f""" GPT-Academic输出文档
""" suf = """
""" if txt.startswith(pre) and txt.endswith(suf): # print('警告,输入了已经经过转化的字符串,二次转化可能出问题') return txt # 已经被转化过,不需要再次转化 find_equation_pattern = r'' pattern = "|".join([pattern for pattern, property in mathpatterns.items() if not property["allow_multi_lines"]]) pattern = re.compile(pattern, flags=re.ASCII) convert_stage_3 = pattern.sub(repl_fn, convert_stage_2) convert_stage_4 = markdown_bug_hunt(convert_stage_3) # 2. convert to rendered equation convert_stage_5, n = re.subn( find_equation_pattern, replace_math_render, convert_stage_4, flags=re.DOTALL ) # cat them together return pre + convert_stage_5 + suf @lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度 def markdown_convertion(txt): """ 将Markdown格式的文本转换为HTML格式。如果包含数学公式,则先将公式转换为HTML格式。 """ pre = '
' suf = "
" if txt.startswith(pre) and txt.endswith(suf): # print('警告,输入了已经经过转化的字符串,二次转化可能出问题') return txt # 已经被转化过,不需要再次转化 find_equation_pattern = r'