Spaces:
Running
Running
from __future__ import annotations | |
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type | |
import logging | |
import json | |
import os | |
import datetime | |
import hashlib | |
import csv | |
import requests | |
import re | |
import html | |
import markdown2 | |
import torch | |
import sys | |
import gc | |
from pygments.lexers import guess_lexer, ClassNotFound | |
import gradio as gr | |
from pypinyin import lazy_pinyin | |
import tiktoken | |
import mdtex2html | |
from markdown import markdown | |
from pygments import highlight | |
from pygments.lexers import guess_lexer,get_lexer_by_name | |
from pygments.formatters import HtmlFormatter | |
from beschreibungen import * | |
logging.basicConfig( | |
level=logging.INFO, | |
format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s", | |
) | |
def markdown_to_html_with_syntax_highlight(md_str): | |
def replacer(match): | |
lang = match.group(1) or "text" | |
code = match.group(2) | |
lang = lang.strip() | |
#print(1,lang) | |
if lang=="text": | |
lexer = guess_lexer(code) | |
lang = lexer.name | |
#print(2,lang) | |
try: | |
lexer = get_lexer_by_name(lang, stripall=True) | |
except ValueError: | |
lexer = get_lexer_by_name("python", stripall=True) | |
formatter = HtmlFormatter() | |
#print(3,lexer.name) | |
highlighted_code = highlight(code, lexer, formatter) | |
return f'<pre><code class="{lang}">{highlighted_code}</code></pre>' | |
code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```" | |
md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE) | |
html_str = markdown(md_str) | |
return html_str | |
def normalize_markdown(md_text: str) -> str: | |
lines = md_text.split("\n") | |
normalized_lines = [] | |
inside_list = False | |
for i, line in enumerate(lines): | |
if re.match(r"^(\d+\.|-|\*|\+)\s", line.strip()): | |
if not inside_list and i > 0 and lines[i - 1].strip() != "": | |
normalized_lines.append("") | |
inside_list = True | |
normalized_lines.append(line) | |
elif inside_list and line.strip() == "": | |
if i < len(lines) - 1 and not re.match( | |
r"^(\d+\.|-|\*|\+)\s", lines[i + 1].strip() | |
): | |
normalized_lines.append(line) | |
continue | |
else: | |
inside_list = False | |
normalized_lines.append(line) | |
return "\n".join(normalized_lines) | |
def convert_mdtext(md_text): | |
code_block_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL) | |
inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL) | |
code_blocks = code_block_pattern.findall(md_text) | |
non_code_parts = code_block_pattern.split(md_text)[::2] | |
result = [] | |
for non_code, code in zip(non_code_parts, code_blocks + [""]): | |
if non_code.strip(): | |
non_code = normalize_markdown(non_code) | |
if inline_code_pattern.search(non_code): | |
result.append(markdown(non_code, extensions=["tables"])) | |
else: | |
result.append(mdtex2html.convert(non_code, extensions=["tables"])) | |
if code.strip(): | |
code = f"\n```{code}\n\n```" | |
code = markdown_to_html_with_syntax_highlight(code) | |
result.append(code) | |
result = "".join(result) | |
result += ALREADY_CONVERTED_MARK | |
return result | |
def convert_asis(userinput): | |
return f"<p style=\"white-space:pre-wrap;\">{html.escape(userinput)}</p>"+ALREADY_CONVERTED_MARK | |
def detect_converted_mark(userinput): | |
if userinput.endswith(ALREADY_CONVERTED_MARK): | |
return True | |
else: | |
return False | |
def detect_language(code): | |
if code.startswith("\n"): | |
first_line = "" | |
else: | |
first_line = code.strip().split("\n", 1)[0] | |
language = first_line.lower() if first_line else "" | |
code_without_language = code[len(first_line) :].lstrip() if first_line else code | |
return language, code_without_language | |
def convert_to_markdown(text): | |
text = text.replace("$","$") | |
def replace_leading_tabs_and_spaces(line): | |
new_line = [] | |
for char in line: | |
if char == "\t": | |
new_line.append("	") | |
elif char == " ": | |
new_line.append(" ") | |
else: | |
break | |
return "".join(new_line) + line[len(new_line):] | |
markdown_text = "" | |
lines = text.split("\n") | |
in_code_block = False | |
for line in lines: | |
if in_code_block is False and line.startswith("```"): | |
in_code_block = True | |
markdown_text += f"{line}\n" | |
elif in_code_block is True and line.startswith("```"): | |
in_code_block = False | |
markdown_text += f"{line}\n" | |
elif in_code_block: | |
markdown_text += f"{line}\n" | |
else: | |
line = replace_leading_tabs_and_spaces(line) | |
line = re.sub(r"^(#)", r"\\\1", line) | |
markdown_text += f"{line} \n" | |
return markdown_text | |
def add_language_tag(text): | |
def detect_language(code_block): | |
try: | |
lexer = guess_lexer(code_block) | |
return lexer.name.lower() | |
except ClassNotFound: | |
return "" | |
code_block_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE) | |
def replacement(match): | |
code_block = match.group(2) | |
if match.group(2).startswith("\n"): | |
language = detect_language(code_block) | |
if language: | |
return f"```{language}{code_block}```" | |
else: | |
return f"```\n{code_block}```" | |
else: | |
return match.group(1) + code_block + "```" | |
text2 = code_block_pattern.sub(replacement, text) | |
return text2 | |
def delete_last_conversation(chatbot, history): | |
if len(chatbot) > 0: | |
chatbot.pop() | |
if len(history) > 0: | |
history.pop() | |
return ( | |
chatbot, | |
history, | |
"Delete Done", | |
) | |
def reset_state(): | |
return [], [], "Reset Done" | |
def reset_textbox(): | |
return gr.update(value=""),"" | |
def cancel_outputing(): | |
return "Stop Done" | |
class State: | |
interrupted = False | |
def interrupt(self): | |
self.interrupted = True | |
def recover(self): | |
self.interrupted = False | |
shared_state = State() | |
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool: | |
for stop_word in stop_words: | |
if s.endswith(stop_word): | |
return True | |
for i in range(1, len(stop_word)): | |
if s.endswith(stop_word[:i]): | |
return True | |
return False | |