Spaces:
Running
Running
File size: 8,385 Bytes
2712d99 b0409b9 dd648bd 2712d99 5d23420 2712d99 b2fba01 2712d99 9a4b562 2712d99 9a4b562 2712d99 5eea959 2712d99 5eea959 9a4b562 4783fd6 9a4b562 5eea959 2712d99 5eea959 2712d99 5eea959 2712d99 4783fd6 2712d99 4783fd6 2712d99 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
from toolbox import update_ui
from toolbox import CatchException, report_execption, write_results_to_file
fast_debug = False
class PaperFileGroup():
def __init__(self):
self.file_paths = []
self.file_contents = []
self.sp_file_contents = []
self.sp_file_index = []
self.sp_file_tag = []
# count_token
from request_llm.bridge_all import model_info
enc = model_info["gpt-3.5-turbo"]['tokenizer']
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
self.get_token_num = get_token_num
def run_file_split(self, max_token_limit=1900):
"""
将长文本分离开来
"""
for index, file_content in enumerate(self.file_contents):
if self.get_token_num(file_content) < max_token_limit:
self.sp_file_contents.append(file_content)
self.sp_file_index.append(index)
self.sp_file_tag.append(self.file_paths[index])
else:
from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
for j, segment in enumerate(segments):
self.sp_file_contents.append(segment)
self.sp_file_index.append(index)
self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.md")
print('Segmentation: done')
def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en'):
import time, os, re
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
# <-------- 读取Markdown文件,删除其中的所有注释 ---------->
pfg = PaperFileGroup()
for index, fp in enumerate(file_manifest):
with open(fp, 'r', encoding='utf-8', errors='replace') as f:
file_content = f.read()
# 记录删除注释后的文本
pfg.file_paths.append(fp)
pfg.file_contents.append(file_content)
# <-------- 拆分过长的Markdown文件 ---------->
pfg.run_file_split(max_token_limit=1500)
n_split = len(pfg.sp_file_contents)
# <-------- 多线程润色开始 ---------->
if language == 'en->zh':
inputs_array = ["This is a Markdown file, translate it into Chinese, do not modify any existing Markdown commands:" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
elif language == 'zh->en':
inputs_array = [f"This is a Markdown file, translate it into English, do not modify any existing Markdown commands:" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
inputs_array=inputs_array,
inputs_show_user_array=inputs_show_user_array,
llm_kwargs=llm_kwargs,
chatbot=chatbot,
history_array=[[""] for _ in range(n_split)],
sys_prompt_array=sys_prompt_array,
# max_workers=5, # OpenAI所允许的最大并行过载
scroller_max_len = 80
)
# <-------- 整理结果,退出 ---------->
create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + f"-chatgpt.polish.md"
res = write_results_to_file(gpt_response_collection, file_name=create_report_file_name)
history = gpt_response_collection
chatbot.append((f"{fp}完成了吗?", res))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
def get_files_from_everything(txt):
import glob, os
success = True
if txt.startswith('http'):
# 网络的远程文件
txt = txt.replace("https://github.com/", "https://raw.githubusercontent.com/")
txt = txt.replace("/blob/", "/")
import requests
from toolbox import get_conf
proxies, = get_conf('proxies')
r = requests.get(txt, proxies=proxies)
with open('./gpt_log/temp.md', 'wb+') as f: f.write(r.content)
project_folder = './gpt_log/'
file_manifest = ['./gpt_log/temp.md']
elif txt.endswith('.md'):
# 直接给定文件
file_manifest = [txt]
project_folder = os.path.dirname(txt)
elif os.path.exists(txt):
# 本地路径,递归搜索
project_folder = txt
file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.md', recursive=True)]
else:
success = False
return success, file_manifest, project_folder
@CatchException
def Markdown英译中(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
# 基本信息:功能、贡献者
chatbot.append([
"函数插件功能?",
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
import tiktoken
import glob, os
except:
report_execption(chatbot, history,
a=f"解析项目: {txt}",
b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
history = [] # 清空历史,以免输入溢出
success, file_manifest, project_folder = get_files_from_everything(txt)
if not success:
# 什么都没有
if txt == "": txt = '空空如也的输入栏'
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
if len(file_manifest) == 0:
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.md文件: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
yield from 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en->zh')
@CatchException
def Markdown中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
# 基本信息:功能、贡献者
chatbot.append([
"函数插件功能?",
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
import tiktoken
import glob, os
except:
report_execption(chatbot, history,
a=f"解析项目: {txt}",
b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
history = [] # 清空历史,以免输入溢出
success, file_manifest, project_folder = get_files_from_everything(txt)
if not success:
# 什么都没有
if txt == "": txt = '空空如也的输入栏'
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
if len(file_manifest) == 0:
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.md文件: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
yield from 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='zh->en') |