Spaces:

JohnSmith9982
/

ChuanhuChatGPT

Running on CPU Upgrade

App Files Files Community

100

ChuanhuChatGPT / modules /utils.py

JohnSmith9982

Upload 114 files

7bab2f2 8 months ago

raw

history blame contribute delete

No virus

24.2 kB

	# -- coding:utf-8 --
	from __future__ import annotations
	from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type
	import logging
	import commentjson as json
	import os
	import datetime
	import csv
	import requests
	import re
	import html
	import hashlib

	import gradio as gr
	from pypinyin import lazy_pinyin
	import tiktoken
	from markdown import markdown
	from pygments import highlight
	from pygments.lexers import get_lexer_by_name
	from pygments.formatters import HtmlFormatter
	import pandas as pd

	from modules.presets import *
	from . import shared
	from modules.config import retrieve_proxy, hide_history_when_not_logged_in

	if TYPE_CHECKING:
	from typing import TypedDict

	class DataframeData(TypedDict):
	headers: List[str]
	data: List[List[str \| int \| bool]]

	def predict(current_model, *args):
	iter = current_model.predict(*args)
	for i in iter:
	yield i

	def billing_info(current_model):
	return current_model.billing_info()

	def set_key(current_model, *args):
	return current_model.set_key(*args)

	def load_chat_history(current_model, *args):
	return current_model.load_chat_history(*args)

	def delete_chat_history(current_model, *args):
	return current_model.delete_chat_history(*args)

	def interrupt(current_model, *args):
	return current_model.interrupt(*args)

	def reset(current_model, *args):
	return current_model.reset(*args)

	def retry(current_model, *args):
	iter = current_model.retry(*args)
	for i in iter:
	yield i

	def delete_first_conversation(current_model, *args):
	return current_model.delete_first_conversation(*args)

	def delete_last_conversation(current_model, *args):
	return current_model.delete_last_conversation(*args)

	def set_system_prompt(current_model, *args):
	return current_model.set_system_prompt(*args)

	def rename_chat_history(current_model, *args):
	return current_model.rename_chat_history(*args)

	def auto_name_chat_history(current_model, *args):
	return current_model.auto_name_chat_history(*args)

	def export_markdown(current_model, *args):
	return current_model.export_markdown(*args)

	def upload_chat_history(current_model, *args):
	return current_model.load_chat_history(*args)

	def set_token_upper_limit(current_model, *args):
	return current_model.set_token_upper_limit(*args)

	def set_temperature(current_model, *args):
	current_model.set_temperature(*args)

	def set_top_p(current_model, *args):
	current_model.set_top_p(*args)

	def set_n_choices(current_model, *args):
	current_model.set_n_choices(*args)

	def set_stop_sequence(current_model, *args):
	current_model.set_stop_sequence(*args)

	def set_max_tokens(current_model, *args):
	current_model.set_max_tokens(*args)

	def set_presence_penalty(current_model, *args):
	current_model.set_presence_penalty(*args)

	def set_frequency_penalty(current_model, *args):
	current_model.set_frequency_penalty(*args)

	def set_logit_bias(current_model, *args):
	current_model.set_logit_bias(*args)

	def set_user_identifier(current_model, *args):
	current_model.set_user_identifier(*args)

	def set_single_turn(current_model, *args):
	current_model.set_single_turn(*args)

	def handle_file_upload(current_model, *args):
	return current_model.handle_file_upload(*args)

	def handle_summarize_index(current_model, *args):
	return current_model.summarize_index(*args)

	def like(current_model, *args):
	return current_model.like(*args)

	def dislike(current_model, *args):
	return current_model.dislike(*args)


	def count_token(input_str):
	encoding = tiktoken.get_encoding("cl100k_base")
	if type(input_str) == dict:
	input_str = f"role: {input_str['role']}, content: {input_str['content']}"
	length = len(encoding.encode(input_str))
	return length


	def markdown_to_html_with_syntax_highlight(md_str): # deprecated
	def replacer(match):
	lang = match.group(1) or "text"
	code = match.group(2)

	try:
	lexer = get_lexer_by_name(lang, stripall=True)
	except ValueError:
	lexer = get_lexer_by_name("text", stripall=True)

	formatter = HtmlFormatter()
	highlighted_code = highlight(code, lexer, formatter)

	return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'

	code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```"
	md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE)

	html_str = markdown(md_str)
	return html_str


	def normalize_markdown(md_text: str) -> str: # deprecated
	lines = md_text.split("\n")
	normalized_lines = []
	inside_list = False

	for i, line in enumerate(lines):
	if re.match(r"^(\d+\.\|-\|\*\|\+)\s", line.strip()):
	if not inside_list and i > 0 and lines[i - 1].strip() != "":
	normalized_lines.append("")
	inside_list = True
	normalized_lines.append(line)
	elif inside_list and line.strip() == "":
	if i < len(lines) - 1 and not re.match(
	r"^(\d+\.\|-\|\*\|\+)\s", lines[i + 1].strip()
	):
	normalized_lines.append(line)
	continue
	else:
	inside_list = False
	normalized_lines.append(line)

	return "\n".join(normalized_lines)


	def convert_mdtext(md_text): # deprecated
	code_block_pattern = re.compile(r"```(.*?)(?:```\|$)", re.DOTALL)
	inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL)
	code_blocks = code_block_pattern.findall(md_text)
	non_code_parts = code_block_pattern.split(md_text)[::2]

	result = []
	raw = f'<div class="raw-message hideM">{html.escape(md_text)}</div>'
	for non_code, code in zip(non_code_parts, code_blocks + [""]):
	if non_code.strip():
	non_code = normalize_markdown(non_code)
	result.append(markdown(non_code, extensions=["tables"]))
	if code.strip():
	# _, code = detect_language(code) # 暂时去除代码高亮功能，因为在大段代码的情况下会出现问题
	# code = code.replace("\n\n", "\n") # 暂时去除代码中的空行，因为在大段代码的情况下会出现问题
	code = f"\n```{code}\n\n```"
	code = markdown_to_html_with_syntax_highlight(code)
	result.append(code)
	result = "".join(result)
	output = f'<div class="md-message">{result}</div>'
	output += raw
	output += ALREADY_CONVERTED_MARK
	return output


	def clip_rawtext(chat_message, need_escape=True):
	# first, clip hr line
	hr_pattern = r'\n\n<hr class="append-display no-in-raw" />(.*?)'
	hr_match = re.search(hr_pattern, chat_message, re.DOTALL)
	message_clipped = chat_message[:hr_match.start()] if hr_match else chat_message
	# second, avoid agent-prefix being escaped
	agent_prefix_pattern = r'(<!-- S O PREFIX --><p class="agent-prefix">.*?<\/p><!-- E O PREFIX -->)'
	# agent_matches = re.findall(agent_prefix_pattern, message_clipped)
	agent_parts = re.split(agent_prefix_pattern, message_clipped, flags=re.DOTALL)
	final_message = ""
	for i, part in enumerate(agent_parts):
	if i % 2 == 0:
	if part != "" and part != "\n":
	final_message += f'<pre class="fake-pre">{escape_markdown(part)}</pre>' if need_escape else f'<pre class="fake-pre">{part}</pre>'
	else:
	final_message += part
	return final_message


	def convert_bot_before_marked(chat_message):
	"""
	注意不能给输出加缩进, 否则会被marked解析成代码块
	"""
	if '<div class="md-message">' in chat_message:
	return chat_message
	else:
	raw = f'<div class="raw-message hideM">{clip_rawtext(chat_message)}</div>'
	# really_raw = f'{START_OF_OUTPUT_MARK}<div class="really-raw hideM">{clip_rawtext(chat_message, need_escape=False)}\n</div>{END_OF_OUTPUT_MARK}'

	code_block_pattern = re.compile(r"```(.*?)(?:```\|$)", re.DOTALL)
	code_blocks = code_block_pattern.findall(chat_message)
	non_code_parts = code_block_pattern.split(chat_message)[::2]
	result = []
	for non_code, code in zip(non_code_parts, code_blocks + [""]):
	if non_code.strip():
	result.append(non_code)
	if code.strip():
	code = f"\n```{code}\n```"
	result.append(code)
	result = "".join(result)
	md = f'<div class="md-message">\n\n{result}\n</div>'
	return raw + md

	def convert_user_before_marked(chat_message):
	if '<div class="user-message">' in chat_message:
	return chat_message
	else:
	return f'<div class="user-message">{escape_markdown(chat_message)}</div>'

	def escape_markdown(text):
	"""
	Escape Markdown special characters to HTML-safe equivalents.
	"""
	escape_chars = {
	# ' ': ' ',
	'_': '_',
	'': '',
	'[': '[',
	']': ']',
	'(': '(',
	')': ')',
	'{': '{',
	'}': '}',
	'#': '#',
	'+': '+',
	'-': '-',
	'.': '.',
	'!': '!',
	'`': '`',
	'>': '>',
	'<': '<',
	'\|': '\|',
	'$': '$',
	':': ':',
	'\n': '<br>',
	}
	text = text.replace(' ', '    ')
	return ''.join(escape_chars.get(c, c) for c in text)


	def convert_asis(userinput): # deprecated
	return (
	f'<p style="white-space:pre-wrap;">{html.escape(userinput)}</p>'
	+ ALREADY_CONVERTED_MARK
	)


	def detect_converted_mark(userinput): # deprecated
	try:
	if userinput.endswith(ALREADY_CONVERTED_MARK):
	return True
	else:
	return False
	except:
	return True


	def detect_language(code): # deprecated
	if code.startswith("\n"):
	first_line = ""
	else:
	first_line = code.strip().split("\n", 1)[0]
	language = first_line.lower() if first_line else ""
	code_without_language = code[len(first_line) :].lstrip() if first_line else code
	return language, code_without_language


	def construct_text(role, text):
	return {"role": role, "content": text}


	def construct_user(text):
	return construct_text("user", text)


	def construct_system(text):
	return construct_text("system", text)


	def construct_assistant(text):
	return construct_text("assistant", text)


	def save_file(filename, system, history, chatbot, user_name):
	os.makedirs(os.path.join(HISTORY_DIR, user_name), exist_ok=True)
	if filename is None:
	filename = new_auto_history_filename(user_name)
	if filename.endswith(".md"):
	filename = filename[:-3]
	if not filename.endswith(".json") and not filename.endswith(".md"):
	filename += ".json"
	if filename == ".json":
	raise Exception("文件名不能为空")

	json_s = {"system": system, "history": history, "chatbot": chatbot}
	repeat_file_index = 2
	if not filename == os.path.basename(filename):
	history_file_path = filename
	else:
	history_file_path = os.path.join(HISTORY_DIR, user_name, filename)

	with open(history_file_path, "w", encoding='utf-8') as f:
	json.dump(json_s, f, ensure_ascii=False)

	filename = os.path.basename(filename)
	filename_md = filename[:-5] + ".md"
	md_s = f"system: \n- {system} \n"
	for data in history:
	md_s += f"\n{data['role']}: \n- {data['content']} \n"
	with open(os.path.join(HISTORY_DIR, user_name, filename_md), "w", encoding="utf8") as f:
	f.write(md_s)
	return os.path.join(HISTORY_DIR, user_name, filename)


	def sorted_by_pinyin(list):
	return sorted(list, key=lambda char: lazy_pinyin(char)[0][0])

	def sorted_by_last_modified_time(list, dir):
	return sorted(list, key=lambda char: os.path.getctime(os.path.join(dir, char)), reverse=True)

	def get_file_names_by_type(dir, filetypes=[".json"]):
	logging.debug(f"获取文件名列表，目录为{dir}，文件类型为{filetypes}")
	files = []
	for type in filetypes:
	files += [f for f in os.listdir(dir) if f.endswith(type)]
	logging.debug(f"files are:{files}")
	return files

	def get_file_names_by_pinyin(dir, filetypes=[".json"]):
	files = get_file_names_by_type(dir, filetypes)
	if files != [""]:
	files = sorted_by_pinyin(files)
	logging.debug(f"files are:{files}")
	return files

	def get_file_names_dropdown_by_pinyin(dir, filetypes=[".json"]):
	files = get_file_names_by_pinyin(dir, filetypes)
	return gr.Dropdown.update(choices=files)

	def get_file_names_by_last_modified_time(dir, filetypes=[".json"]):
	files = get_file_names_by_type(dir, filetypes)
	if files != [""]:
	files = sorted_by_last_modified_time(files, dir)
	logging.debug(f"files are:{files}")
	return files


	def get_history_names(user_name=""):
	logging.debug(f"从用户 {user_name} 中获取历史记录文件名列表")
	if user_name == "" and hide_history_when_not_logged_in:
	return []
	else:
	history_files = get_file_names_by_last_modified_time(os.path.join(HISTORY_DIR, user_name))
	history_files = [f[:f.rfind(".")] for f in history_files]
	return history_files

	def get_first_history_name(user_name=""):
	history_names = get_history_names(user_name)
	return history_names[0] if history_names else None

	def get_history_list(user_name=""):
	history_names = get_history_names(user_name)
	return gr.Radio.update(choices=history_names)

	def init_history_list(user_name=""):
	history_names = get_history_names(user_name)
	return gr.Radio.update(choices=history_names, value=history_names[0] if history_names else "")

	def filter_history(user_name, keyword):
	history_names = get_history_names(user_name)
	try:
	history_names = [name for name in history_names if re.search(keyword, name)]
	return gr.update(choices=history_names)
	except:
	return gr.update(choices=history_names)

	def load_template(filename, mode=0):
	logging.debug(f"加载模板文件{filename}，模式为{mode}（0为返回字典和下拉菜单，1为返回下拉菜单，2为返回字典）")
	lines = []
	if filename.endswith(".json"):
	with open(os.path.join(TEMPLATES_DIR, filename), "r", encoding="utf8") as f:
	lines = json.load(f)
	lines = [[i["act"], i["prompt"]] for i in lines]
	else:
	with open(
	os.path.join(TEMPLATES_DIR, filename), "r", encoding="utf8"
	) as csvfile:
	reader = csv.reader(csvfile)
	lines = list(reader)
	lines = lines[1:]
	if mode == 1:
	return sorted_by_pinyin([row[0] for row in lines])
	elif mode == 2:
	return {row[0]: row[1] for row in lines}
	else:
	choices = sorted_by_pinyin([row[0] for row in lines])
	return {row[0]: row[1] for row in lines}, gr.Dropdown.update(
	choices=choices
	)


	def get_template_names():
	logging.debug("获取模板文件名列表")
	return get_file_names_by_pinyin(TEMPLATES_DIR, filetypes=[".csv", "json"])

	def get_template_dropdown():
	logging.debug("获取模板下拉菜单")
	template_names = get_template_names()
	return gr.Dropdown.update(choices=template_names)


	def get_template_content(templates, selection, original_system_prompt):
	logging.debug(f"应用模板中，选择为{selection}，原始系统提示为{original_system_prompt}")
	try:
	return templates[selection]
	except:
	return original_system_prompt


	def reset_textbox():
	logging.debug("重置文本框")
	return gr.update(value="")


	def reset_default():
	default_host = shared.state.reset_api_host()
	retrieve_proxy("")
	return gr.update(value=default_host), gr.update(value=""), "API-Host 和代理已重置"


	def change_api_host(host):
	shared.state.set_api_host(host)
	msg = f"API-Host更改为了{host}"
	logging.info(msg)
	return msg


	def change_proxy(proxy):
	retrieve_proxy(proxy)
	os.environ["HTTPS_PROXY"] = proxy
	msg = f"代理更改为了{proxy}"
	logging.info(msg)
	return msg


	def hide_middle_chars(s):
	if s is None:
	return ""
	if len(s) <= 8:
	return s
	else:
	head = s[:4]
	tail = s[-4:]
	hidden = "" (len(s) - 8)
	return head + hidden + tail


	def submit_key(key):
	key = key.strip()
	msg = f"API密钥更改为了{hide_middle_chars(key)}"
	logging.info(msg)
	return key, msg


	def replace_today(prompt):
	today = datetime.datetime.today().strftime("%Y-%m-%d")
	return prompt.replace("{current_date}", today)


	def get_geoip():
	try:
	with retrieve_proxy():
	response = requests.get("https://ipapi.co/json/", timeout=5)
	data = response.json()
	except:
	data = {"error": True, "reason": "连接ipapi失败"}
	if "error" in data.keys():
	logging.warning(f"无法获取IP地址信息。\n{data}")
	if data["reason"] == "RateLimited":
	return (
	i18n("您的IP区域：未知。")
	)
	else:
	return i18n("获取IP地理位置失败。原因：") + f"{data['reason']}" + i18n("。你仍然可以使用聊天功能。")
	else:
	country = data["country_name"]
	if country == "China":
	text = "您的IP区域：中国。请立即检查代理设置，在不受支持的地区使用API可能导致账号被封禁。"
	else:
	text = i18n("您的IP区域：") + f"{country}。"
	logging.info(text)
	return text


	def find_n(lst, max_num):
	n = len(lst)
	total = sum(lst)

	if total < max_num:
	return n

	for i in range(len(lst)):
	if total - lst[i] < max_num:
	return n - i - 1
	total = total - lst[i]
	return 1


	def start_outputing():
	logging.debug("显示取消按钮，隐藏发送按钮")
	return gr.Button.update(visible=False), gr.Button.update(visible=True)


	def end_outputing():
	return (
	gr.Button.update(visible=True),
	gr.Button.update(visible=False),
	)


	def cancel_outputing():
	logging.info("中止输出……")
	shared.state.interrupt()


	def transfer_input(inputs):
	# 一次性返回，降低延迟
	textbox = reset_textbox()
	outputing = start_outputing()
	return (
	inputs,
	gr.update(value=""),
	gr.Button.update(visible=False),
	gr.Button.update(visible=True),
	)


	def update_chuanhu():
	from .repo import background_update

	print("[Updater] Trying to update...")
	update_status = background_update()
	if update_status == "success":
	logging.info("Successfully updated, restart needed")
	status = '<span id="update-status" class="hideK">success</span>'
	return gr.Markdown.update(value=i18n("更新成功，请重启本程序")+status)
	else:
	status = '<span id="update-status" class="hideK">failure</span>'
	return gr.Markdown.update(value=i18n("更新失败，请尝试[手动更新](https://github.com/GaiZhenbiao/ChuanhuChatGPT/wiki/使用教程#手动更新)")+status)


	def add_source_numbers(lst, source_name = "Source", use_source = True):
	if use_source:
	return [f'[{idx+1}]\t "{item[0]}"\n{source_name}: {item[1]}' for idx, item in enumerate(lst)]
	else:
	return [f'[{idx+1}]\t "{item}"' for idx, item in enumerate(lst)]

	def add_details(lst):
	nodes = []
	for index, txt in enumerate(lst):
	brief = txt[:25].replace("\n", "")
	nodes.append(
	f"<details><summary>{brief}...</summary><p>{txt}</p></details>"
	)
	return nodes


	def sheet_to_string(sheet, sheet_name = None):
	result = []
	for index, row in sheet.iterrows():
	row_string = ""
	for column in sheet.columns:
	row_string += f"{column}: {row[column]}, "
	row_string = row_string.rstrip(", ")
	row_string += "."
	result.append(row_string)
	return result

	def excel_to_string(file_path):
	# 读取Excel文件中的所有工作表
	excel_file = pd.read_excel(file_path, engine='openpyxl', sheet_name=None)

	# 初始化结果字符串
	result = []

	# 遍历每一个工作表
	for sheet_name, sheet_data in excel_file.items():

	# 处理当前工作表并添加到结果字符串
	result += sheet_to_string(sheet_data, sheet_name=sheet_name)


	return result

	def get_last_day_of_month(any_day):
	# The day 28 exists in every month. 4 days later, it's always next month
	next_month = any_day.replace(day=28) + datetime.timedelta(days=4)
	# subtracting the number of the current day brings us back one month
	return next_month - datetime.timedelta(days=next_month.day)

	def get_model_source(model_name, alternative_source):
	if model_name == "gpt2-medium":
	return "https://huggingface.co/gpt2-medium"

	def refresh_ui_elements_on_load(current_model, selected_model_name, user_name):
	current_model.set_user_identifier(user_name)
	return toggle_like_btn_visibility(selected_model_name), *current_model.auto_load()

	def toggle_like_btn_visibility(selected_model_name):
	if selected_model_name == "xmchat":
	return gr.update(visible=True)
	else:
	return gr.update(visible=False)

	def get_corresponding_file_type_by_model_name(selected_model_name):
	if selected_model_name in ["xmchat", "GPT4 Vision"]:
	return ["image"]
	else:
	return [".pdf", ".docx", ".pptx", ".epub", ".xlsx", ".txt", "text"]

	# def toggle_file_type(selected_model_name):
	# return gr.Files.update(file_types=get_corresponding_file_type_by_model_name(selected_model_name))

	def new_auto_history_filename(username):
	latest_file = get_first_history_name(username)
	if latest_file:
	with open(os.path.join(HISTORY_DIR, username, latest_file + ".json"), 'r', encoding="utf-8") as f:
	if len(f.read()) == 0:
	return latest_file
	now = i18n("新对话 ") + datetime.datetime.now().strftime('%m-%d %H-%M')
	return f'{now}.json'

	def get_history_filepath(username):
	dirname = os.path.join(HISTORY_DIR, username)
	os.makedirs(dirname, exist_ok=True)
	latest_file = get_first_history_name(username)
	if not latest_file:
	latest_file = new_auto_history_filename(username)

	latest_file = os.path.join(dirname, latest_file)
	return latest_file

	def beautify_err_msg(err_msg):
	if "insufficient_quota" in err_msg:
	return i18n("剩余配额不足，[进一步了解](https://github.com/GaiZhenbiao/ChuanhuChatGPT/wiki/%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98#you-exceeded-your-current-quota-please-check-your-plan-and-billing-details)")
	if "The model `gpt-4` does not exist" in err_msg:
	return i18n("你没有权限访问 GPT4，[进一步了解](https://github.com/GaiZhenbiao/ChuanhuChatGPT/issues/843)")
	if "Resource not found" in err_msg:
	return i18n("请查看 config_example.json，配置 Azure OpenAI")
	return err_msg

	def auth_from_conf(username, password):
	try:
	with open("config.json", encoding="utf-8") as f:
	conf = json.load(f)
	usernames, passwords = [i[0] for i in conf["users"]], [i[1] for i in conf["users"]]
	if username in usernames:
	if passwords[usernames.index(username)] == password:
	return True
	return False
	except:
	return False

	def get_file_hash(file_src=None, file_paths=None):
	if file_src:
	file_paths = [x.name for x in file_src]
	file_paths.sort(key=lambda x: os.path.basename(x))

	md5_hash = hashlib.md5()
	for file_path in file_paths:
	with open(file_path, "rb") as f:
	while chunk := f.read(8192):
	md5_hash.update(chunk)

	return md5_hash.hexdigest()

	def myprint(**args):
	print(args)

	def replace_special_symbols(string, replace_string=" "):
	# 定义正则表达式，匹配所有特殊符号
	pattern = r'[!@#$%^&*()<>?/\\|}{~:]'

	new_string = re.sub(pattern, replace_string, string)

	return new_string