gpt-academic2

Runtime error

gpt-academic2 / toolbox.py

Siyuan Feng

feat: clean pdf fitz text

ab879ca over 1 year ago

12.1 kB

	import markdown, mdtex2html, threading, importlib, traceback
	from show_math import convert as convert_math
	from functools import wraps

	def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=None, sys_prompt=''):
	    """
	    Call predict_no_ui in a worker thread while the calling generator keeps the UI alive.

	    While the worker is running, the last chatbot entry is replaced once per second
	    with a progress message and ``(chatbot, history, '正常')`` is yielded.
	    When predict_no_ui raises ConnectionAbortedError the worker halves the history
	    entries (or, with no history, the input itself) and tries again.

	    Args:
	        i_say: Text sent to predict_no_ui.
	        i_say_show_user: Text shown as the question in the chatbot entry.
	        chatbot: List of chat pairs; its last element is overwritten with progress.
	        top_p, temperature: Forwarded unchanged to predict_no_ui.
	        history: Previous conversation; defaults to a fresh empty list.
	        sys_prompt: Forwarded unchanged to predict_no_ui.

	    Returns (as the generator's return value):
	        The value produced by predict_no_ui.

	    Raises:
	        TimeoutError: if the worker reported a timeout.
	    """
	    import time
	    from predict import predict_no_ui
	    from toolbox import get_conf
	    TIMEOUT_SECONDS, MAX_RETRY = get_conf('TIMEOUT_SECONDS', 'MAX_RETRY')
	    # A default list in the signature would be shared between calls and is
	    # handed to callers through the yields below, so build a fresh one here.
	    if history is None:
	        history = []
	    # Threads need a mutable container to pass results around:
	    # slot 0 holds the gpt output, slot 1 holds a warning shown in the UI.
	    mutable = [None, '']

	    # multi-threading worker
	    def mt(i_say, history):
	        while True:
	            try:
	                mutable[0] = predict_no_ui(inputs=i_say, top_p=top_p, temperature=temperature, history=history, sys_prompt=sys_prompt)
	                break
	            except ConnectionAbortedError:
	                if len(history) > 0:
	                    # Keep the second half of every history entry.
	                    history = [his[len(his)//2:] for his in history if his is not None]
	                    mutable[1] = 'Warning! History conversation is too long, cut into half. '
	                else:
	                    # No history left to trim: keep the first half of the input.
	                    i_say = i_say[:len(i_say)//2]
	                    mutable[1] = 'Warning! Input file is too long, cut into half. '
	            except TimeoutError:
	                # Leave a marker for the main thread, which re-raises after the join loop.
	                mutable[0] = '[Local Message] Failed with timeout.'
	                raise TimeoutError

	    # Start a new thread that issues the http request.
	    thread_name = threading.Thread(target=mt, args=(i_say, history))
	    thread_name.start()
	    # The original thread keeps refreshing the UI with a countdown until the worker ends.
	    cnt = 0
	    while thread_name.is_alive():
	        cnt += 1
	        chatbot[-1] = (i_say_show_user, f"[Local Message] {mutable[1]}waiting gpt response {cnt}/{TIMEOUT_SECONDS*2*(MAX_RETRY+1)}"+''.join(['.']*(cnt%4)))
	        yield chatbot, history, '正常'
	        time.sleep(1)
	    # Fetch the gpt output from the shared container.
	    gpt_say = mutable[0]
	    if gpt_say == '[Local Message] Failed with timeout.':
	        raise TimeoutError
	    return gpt_say

	def write_results_to_file(history, file_name=None):
	    """
	    Write the conversation in ``history`` to a Markdown file under ./gpt_log/.

	    Entries at even positions are written as level-2 headings, the others as
	    plain text. When ``file_name`` is None a name is generated from the
	    current local time. The confirmation message (which contains the absolute
	    path of the file) is printed and returned.
	    """
	    import os
	    import time

	    if file_name is None:
	        timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
	        file_name = 'chatGPT分析报告' + timestamp + '.md'
	    os.makedirs('./gpt_log/', exist_ok=True)
	    target = f'./gpt_log/{file_name}'
	    with open(target, 'w', encoding='utf8') as report:
	        report.write('# chatGPT 分析报告\n')
	        for index, entry in enumerate(history):
	            is_heading = index % 2 == 0
	            if is_heading:
	                report.write('## ')
	            report.write(entry)
	            report.write('\n\n')
	    message = '以上材料已经被写入' + os.path.abspath(target)
	    print(message)
	    return message

	def regular_txt_to_markdown(text):
	    """
	    Convert plain text to Markdown-style text.

	    Every newline is doubled, then two passes replace triple newlines with
	    double newlines.
	    """
	    converted = text.replace('\n', '\n\n')
	    for _ in range(2):
	        converted = converted.replace('\n\n\n', '\n\n')
	    return converted

	def CatchException(f):
	    """
	    Decorator: run generator function ``f`` and show any exception in the chat.

	    The wrapper re-yields everything ``f`` yields. If ``f`` raises, the answer
	    part of the last chatbot entry is replaced with the traceback and the
	    result of check_proxy, and one final ``(chatbot, history, message)`` tuple
	    is yielded.
	    """
	    @wraps(f)
	    def guarded(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
	        try:
	            yield from f(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT)
	        except Exception as err:
	            from check_proxy import check_proxy
	            from toolbox import get_conf
	            proxies, = get_conf('proxies')
	            trace_md = regular_txt_to_markdown(traceback.format_exc())
	            question = chatbot[-1][0]
	            proxy_status = check_proxy(proxies)
	            chatbot[-1] = (question, f"[Local Message] 实验性函数调用出错: \n\n {trace_md} \n\n 当前代理可用性: \n\n {proxy_status}")
	            yield chatbot, history, f'异常 {err}'
	    return guarded

	def report_execption(chatbot, history, a, b):
	    """
	    Record an error exchange: append the pair (a, b) to ``chatbot`` and
	    append ``a`` then ``b`` to ``history``.
	    """
	    chatbot.append((a, b))
	    history.extend((a, b))

	def text_divide_paragraph(text):
	    """
	    Turn free-form text into HTML with explicit line breaks.

	    Text containing a ``` code fence is returned unchanged. Otherwise every
	    space becomes ``&nbsp;`` so runs of spaces survive HTML rendering, and the
	    lines are joined with ``</br>``.
	    """
	    if '```' in text:
	        # careful input: leave fenced code for the markdown converter
	        return text
	    # free-form input: the space replacement used to be a no-op
	    # (space -> space); use the HTML entity so spacing is kept.
	    return "</br>".join(line.replace(" ", "&nbsp;") for line in text.split("\n"))

	def markdown_convertion(txt):
	    """
	    Convert Markdown text to HTML.

	    When the text contains '$' and no ``` code fence, the plain conversion is
	    followed by '<br><br>' and a second conversion of the text after it has
	    been passed through convert_math.
	    """
	    def render(source):
	        return markdown.markdown(source, extensions=['fenced_code','tables'])

	    plain_html = render(txt)
	    if '$' not in txt or '```' in txt:
	        return plain_html
	    math_html = render(convert_math(txt, splitParagraphs=False))
	    return plain_html + '<br><br>' + math_html


	def format_io(self, y):
	    """
	    Convert the last (question, answer) pair of ``y`` to HTML, in place.

	    The question is passed through text_divide_paragraph and then markdown;
	    the answer is passed through markdown_convertion. A part that is None
	    stays None. Returns [] when ``y`` is None or empty, otherwise ``y``.
	    """
	    if y is None or y == []:
	        return []
	    i_ask, gpt_reply = y[-1]
	    # Check for None before any text processing: text_divide_paragraph
	    # does a substring test and would raise TypeError on None.
	    if i_ask is not None:
	        # User input is free-form, so pre-process it before markdown.
	        i_ask = markdown.markdown(text_divide_paragraph(i_ask), extensions=['fenced_code','tables'])
	    if gpt_reply is not None:
	        gpt_reply = markdown_convertion(gpt_reply)
	    y[-1] = (i_ask, gpt_reply)
	    return y


	def find_free_port():
	    """
	    Return a port number that was free at the time of the call.

	    A TCP socket is bound to port 0, the port number reported by
	    getsockname() is read, and the socket is closed before returning.
	    """
	    import socket

	    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
	    try:
	        sock.bind(('', 0))
	        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
	        _, port = sock.getsockname()
	        return port
	    finally:
	        sock.close()


	def extract_archive(file_path, dest_dir):
	    """
	    Extract a zip or tar archive into ``dest_dir``, chosen by file extension.

	    '.zip' files are handled with zipfile; '.tar', '.gz' and '.bz2' files are
	    opened with tarfile. Any other extension is ignored and nothing happens.

	    Tar members whose path (or link target) would land outside ``dest_dir``
	    are skipped, because tarfile.extractall does not reject such members on
	    all Python versions. zipfile.extractall is documented to sanitise member
	    names itself.
	    """
	    import zipfile
	    import tarfile
	    import os

	    def _stays_inside(root, candidate):
	        # True when candidate resolves to root itself or to a path below it.
	        root = os.path.realpath(root)
	        candidate = os.path.realpath(candidate)
	        try:
	            return os.path.commonpath([root, candidate]) == root
	        except ValueError:
	            # commonpath refuses to compare e.g. paths on different drives.
	            return False

	    def _safe_tar_members(tarobj):
	        # Collect only the members that cannot escape dest_dir.
	        safe = []
	        for member in tarobj.getmembers():
	            target = os.path.join(dest_dir, member.name)
	            if not _stays_inside(dest_dir, target):
	                print("Skipped unsafe tar member {}".format(member.name))
	                continue
	            if member.issym() or member.islnk():
	                # Symlink targets are relative to the link's own directory,
	                # hard-link targets are relative to the archive root.
	                link_base = os.path.dirname(target) if member.issym() else dest_dir
	                if not _stays_inside(dest_dir, os.path.join(link_base, member.linkname)):
	                    print("Skipped unsafe tar link {}".format(member.name))
	                    continue
	            safe.append(member)
	        return safe

	    # Get the file extension of the input file
	    file_extension = os.path.splitext(file_path)[1]

	    # Extract the archive based on its extension
	    if file_extension == '.zip':
	        with zipfile.ZipFile(file_path, 'r') as zipobj:
	            zipobj.extractall(path=dest_dir)
	            print("Successfully extracted zip archive to {}".format(dest_dir))
	    elif file_extension in ['.tar', '.gz', '.bz2']:
	        with tarfile.open(file_path, 'r:*') as tarobj:
	            tarobj.extractall(path=dest_dir, members=_safe_tar_members(tarobj))
	            print("Successfully extracted tar archive to {}".format(dest_dir))
	    else:
	        return

	def find_recent_files(directory):
	    """
	    List the files directly inside ``directory`` whose ctime (as reported by
	    os.path.getctime) lies within the last 60 seconds.

	    Paths ending in '.log' and sub-directories are left out. The returned
	    paths are ``directory`` joined with each entry name.
	    """
	    import os
	    import time

	    cutoff = time.time() - 60
	    matches = []
	    for entry in os.listdir(directory):
	        candidate = os.path.join(directory, entry)
	        if candidate.endswith('.log'):
	            continue
	        if os.path.getctime(candidate) < cutoff:
	            continue
	        if not os.path.isdir(candidate):
	            matches.append(candidate)
	    return matches


	def on_file_uploaded(files, chatbot, txt):
	    """
	    Store uploaded files under private_upload/<timestamp>/ and report them in the chat.

	    Any existing private_upload directory is removed first. Each upload is
	    copied from ``file.name`` to a file named after the basename of
	    ``file.orig_name`` and handed to extract_archive, which unpacks it into
	    '<name>.extract' when it is a supported archive.

	    Returns ``(chatbot, txt)``: a message listing everything found under
	    private_upload is appended to ``chatbot`` and ``txt`` becomes the new
	    upload directory. With no files, both arguments are returned unchanged.
	    """
	    # `not files` also covers None, on which len() would raise.
	    if not files:
	        return chatbot, txt
	    import shutil, os, time, glob
	    from toolbox import extract_archive
	    # Best-effort cleanup of the previous upload; a missing directory is fine.
	    shutil.rmtree('./private_upload/', ignore_errors=True)
	    time_tag = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
	    upload_dir = f'private_upload/{time_tag}'
	    os.makedirs(upload_dir, exist_ok=True)
	    for file in files:
	        file_origin_name = os.path.basename(file.orig_name)
	        saved_path = f'{upload_dir}/{file_origin_name}'
	        shutil.copy(file.name, saved_path)
	        extract_archive(saved_path, dest_dir=f'{saved_path}.extract')
	    # recursive=True only has an effect together with '**'; the previous
	    # pattern 'private_upload/*/' listed just the timestamp directory.
	    moved_files = glob.glob('private_upload/**/*', recursive=True)
	    txt = upload_dir
	    moved_files_str = '\t\n\n'.join(moved_files)
	    chatbot.append(['我上传了文件，请查收',
	                    f'[Local Message] 收到以下文件: \n\n{moved_files_str}\n\n调用路径参数已自动修正到: \n\n{txt}\n\n现在您点击任意实验功能时，以上文件将被作为输入参数'])
	    return chatbot, txt


	def on_report_generated(files, chatbot):
	    """
	    Collect the recently written files in 'gpt_log' via find_recent_files.

	    When at least one file is found, a notice is appended to ``chatbot``.
	    Returns ``(report_files, chatbot)``; the ``files`` argument is not used.
	    """
	    from toolbox import find_recent_files
	    report_files = find_recent_files('gpt_log')
	    if len(report_files) != 0:
	        chatbot.append(['汇总报告如何远程获取？', '汇总报告已经添加到右侧文件上传区，请查收。'])
	    return report_files, chatbot

	def get_conf(*args):
	    """
	    Look up configuration values by name and return them as a list.

	    Each name is read from the module ``config_private`` first; when that
	    fails for any reason, it is read from the module ``config`` instead.
	    Keeping secrets such as the API key and proxy address in a private copy
	    named config_private.py avoids pushing them to a public repository by accident.

	    Raises:
	        AssertionError: if 'API_KEY' is requested and its length is not 51.
	        ImportError / AttributeError: if the fallback lookup in ``config`` fails.
	    """
	    res = []
	    for arg in args:
	        try:
	            r = getattr(importlib.import_module('config_private'), arg)
	        except Exception:
	            # Deliberately broad: any problem with the private file falls back
	            # to the public config, but KeyboardInterrupt/SystemExit pass through.
	            r = getattr(importlib.import_module('config'), arg)
	        res.append(r)
	        # When reading API_KEY, check that the config was actually edited.
	        if arg == 'API_KEY' and len(r) != 51:
	            # Raised explicitly: an `assert` statement is removed under `python -O`.
	            raise AssertionError(
	                "正确的API_KEY密钥是51位，请在config文件中修改API密钥, 添加海外代理之后再运行。" + \
	                "（如果您刚更新过代码，请确保旧版config_private文件中没有遗留任何新增键值）")
	    return res

	def clear_line_break(txt):
	    """
	    Replace every newline with a space, then squeeze doubled spaces.

	    Two replacement passes are made, so runs of up to four spaces end up as
	    a single space.
	    """
	    txt = txt.replace('\n', ' ')
	    # These two passes used to replace a single space with a single space,
	    # which did nothing; they are meant to collapse doubled spaces.
	    txt = txt.replace('  ', ' ')
	    txt = txt.replace('  ', ' ')
	    return txt

	import re
	import unicodedata

	def is_paragraph_break(match):
	    """
	    Decide whether a matched newline is a paragraph break.

	    ``match`` must have two groups: the character before the newline and the
	    character after it. Returns "\n\n" when the previous character is one of
	    '.', '!' or '?', the next character is upper case, and more than 140
	    characters precede the previous character in the searched string.
	    Otherwise returns a single space.
	    """
	    before, after = match.groups()

	    # characters that end a sentence
	    sentence_enders = ".!?"
	    # minimum amount of preceding text before a break is accepted
	    length_threshold = 140

	    if before not in sentence_enders:
	        return " "
	    if not after.isupper():
	        return " "
	    if len(match.string[:match.start(1)]) <= length_threshold:
	        return " "
	    return "\n\n"

	def normalize_text(text):
	    """
	    Reduce text to plain ASCII.

	    The text is NFKD-normalised, which splits compatibility characters such as
	    the single-character ligature U+FB01 into "f" and "i", and every remaining
	    non-ASCII character is then removed.
	    """
	    decomposed = unicodedata.normalize("NFKD", text)
	    return re.sub(r'[^\x00-\x7F]+', '', decomposed)

	def clean_text(raw_text):
	    """
	    Clean and re-flow raw text extracted from a PDF.

	    1. Normalise the text with normalize_text.
	    2. Re-join words hyphenated across a line break, e.g. "Espe-\ncially"
	       becomes "Especially".
	    3. Replace each newline that sits between two non-whitespace characters
	       with either a space or a paragraph break, as decided by
	       is_paragraph_break.

	    Returns the result with leading and trailing whitespace stripped.
	    """
	    # Normalise the text.
	    normalized_text = normalize_text(raw_text)

	    # Re-join words that were hyphenated across lines.
	    text = re.sub(r'(\w+-\n\w+)', lambda m: m.group(1).replace('-\n', ''), normalized_text)

	    # Match only the newline itself. The neighbours are captured inside
	    # lookarounds, so they are not consumed and a one-character line such as
	    # the "b" in "a\nb\nc" can serve as neighbour of both newlines. A consuming
	    # pattern would leave the second newline untouched.
	    newlines = re.compile(r'(?<=(\S))\n(?=(\S))')

	    # Groups 1 and 2 are still the previous and next character, which is
	    # what is_paragraph_break reads from the match.
	    final_text = newlines.sub(is_paragraph_break, text)

	    return final_text.strip()