Disallow special tokens + limit the number of files to <= 512
Browse files
crazy_functions/Latex全文润色.py
CHANGED
@@ -14,7 +14,7 @@ class PaperFileGroup():
|
|
14 |
import tiktoken
|
15 |
from toolbox import get_conf
|
16 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
17 |
-
def get_token_num(txt): return len(enc.encode(txt))
|
18 |
self.get_token_num = get_token_num
|
19 |
|
20 |
def run_file_split(self, max_token_limit=1900):
|
|
|
14 |
import tiktoken
|
15 |
from toolbox import get_conf
|
16 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
17 |
+
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))  # token count of txt; disallowed_special=() turns off tiktoken's special-token check, so special-token text is encoded as ordinary text instead of raising
|
18 |
self.get_token_num = get_token_num
|
19 |
|
20 |
def run_file_split(self, max_token_limit=1900):
|
crazy_functions/Latex全文翻译.py
CHANGED
@@ -14,7 +14,7 @@ class PaperFileGroup():
|
|
14 |
import tiktoken
|
15 |
from toolbox import get_conf
|
16 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
17 |
-
def get_token_num(txt): return len(enc.encode(txt))
|
18 |
self.get_token_num = get_token_num
|
19 |
|
20 |
def run_file_split(self, max_token_limit=1900):
|
|
|
14 |
import tiktoken
|
15 |
from toolbox import get_conf
|
16 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
17 |
+
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))  # token count of txt; disallowed_special=() turns off tiktoken's special-token check, so special-token text is encoded as ordinary text instead of raising
|
18 |
self.get_token_num = get_token_num
|
19 |
|
20 |
def run_file_split(self, max_token_limit=1900):
|
crazy_functions/crazy_utils.py
CHANGED
@@ -6,7 +6,7 @@ def input_clipping(inputs, history, max_token_limit):
|
|
6 |
import numpy as np
|
7 |
from toolbox import get_conf
|
8 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
9 |
-
def get_token_num(txt): return len(enc.encode(txt))
|
10 |
|
11 |
mode = 'input-and-history'
|
12 |
# 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
|
@@ -23,7 +23,7 @@ def input_clipping(inputs, history, max_token_limit):
|
|
23 |
|
24 |
while n_token > max_token_limit:
|
25 |
where = np.argmax(everything_token)
|
26 |
-
encoded = enc.encode(everything[where])
|
27 |
clipped_encoded = encoded[:len(encoded)-delta]
|
28 |
everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char
|
29 |
everything_token[where] = get_token_num(everything[where])
|
|
|
6 |
import numpy as np
|
7 |
from toolbox import get_conf
|
8 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
9 |
+
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))  # token count of txt; disallowed_special=() turns off tiktoken's special-token check, so special-token text is encoded as ordinary text instead of raising
|
10 |
|
11 |
mode = 'input-and-history'
|
12 |
# 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
|
|
|
23 |
|
24 |
while n_token > max_token_limit:
|
25 |
where = np.argmax(everything_token)
|
26 |
+
encoded = enc.encode(everything[where], disallowed_special=())
|
27 |
clipped_encoded = encoded[:len(encoded)-delta]
|
28 |
everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char
|
29 |
everything_token[where] = get_token_num(everything[where])
|
crazy_functions/代码重写为全英文_多线程.py
CHANGED
@@ -62,7 +62,7 @@ def 全项目切换英文(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_
|
|
62 |
import tiktoken
|
63 |
from toolbox import get_conf
|
64 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
65 |
-
def get_token_fn(txt): return len(enc.encode(txt))
|
66 |
|
67 |
|
68 |
# 第6步:任务函数
|
|
|
62 |
import tiktoken
|
63 |
from toolbox import get_conf
|
64 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
65 |
+
def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))  # token count of txt; disallowed_special=() turns off tiktoken's special-token check, so special-token text is encoded as ordinary text instead of raising
|
66 |
|
67 |
|
68 |
# 第6步:任务函数
|
crazy_functions/批量Markdown翻译.py
CHANGED
@@ -14,7 +14,7 @@ class PaperFileGroup():
|
|
14 |
import tiktoken
|
15 |
from toolbox import get_conf
|
16 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
17 |
-
def get_token_num(txt): return len(enc.encode(txt))
|
18 |
self.get_token_num = get_token_num
|
19 |
|
20 |
def run_file_split(self, max_token_limit=1900):
|
|
|
14 |
import tiktoken
|
15 |
from toolbox import get_conf
|
16 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
17 |
+
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))  # token count of txt; disallowed_special=() turns off tiktoken's special-token check, so special-token text is encoded as ordinary text instead of raising
|
18 |
self.get_token_num = get_token_num
|
19 |
|
20 |
def run_file_split(self, max_token_limit=1900):
|
crazy_functions/批量翻译PDF文档_多线程.py
CHANGED
@@ -70,7 +70,7 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
|
|
70 |
from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
|
71 |
from toolbox import get_conf
|
72 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
73 |
-
def get_token_num(txt): return len(enc.encode(txt))
|
74 |
paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
|
75 |
txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
|
76 |
page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
|
|
|
70 |
from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
|
71 |
from toolbox import get_conf
|
72 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
73 |
+
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))  # token counter handed to the fragment splitter as get_token_fn; disallowed_special=() turns off tiktoken's special-token check, so special-token text is encoded as ordinary text instead of raising
|
74 |
paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
|
75 |
txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
|
76 |
page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
|
crazy_functions/理解PDF文档内容.py
CHANGED
@@ -19,7 +19,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
|
|
19 |
from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
|
20 |
from toolbox import get_conf
|
21 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
22 |
-
def get_token_num(txt): return len(enc.encode(txt))
|
23 |
paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
|
24 |
txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
|
25 |
page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
|
|
|
19 |
from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
|
20 |
from toolbox import get_conf
|
21 |
enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
|
22 |
+
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))  # token counter handed to the fragment splitter as get_token_fn; disallowed_special=() turns off tiktoken's special-token check, so special-token text is encoded as ordinary text instead of raising
|
23 |
paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
|
24 |
txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
|
25 |
page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
|
crazy_functions/解析项目源代码.py
CHANGED
@@ -11,7 +11,8 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
|
|
11 |
history_array = []
|
12 |
sys_prompt_array = []
|
13 |
report_part_1 = []
|
14 |
-
|
|
|
15 |
############################## <第一步,逐个文件分析,多线程> ##################################
|
16 |
for index, fp in enumerate(file_manifest):
|
17 |
with open(fp, 'r', encoding='utf-8', errors='replace') as f:
|
|
|
11 |
history_array = []
|
12 |
sys_prompt_array = []
|
13 |
report_part_1 = []
|
14 |
+
|
15 |
+
assert len(file_manifest) <= 512, "源文件太多, 请缩减输入文件的数量, 或者删除此行并拆分file_manifest以保证结果能被分批存储。"
|
16 |
############################## <第一步,逐个文件分析,多线程> ##################################
|
17 |
for index, fp in enumerate(file_manifest):
|
18 |
with open(fp, 'r', encoding='utf-8', errors='replace') as f:
|