qingxu99 committed
Commit 745734b
1 Parent(s): 2bb1f3d

Improve efficiency (改进效率)
crazy_functions/代码重写为全英文_多线程.py CHANGED
````diff
@@ -10,16 +10,13 @@ def extract_code_block_carefully(txt):
     txt_out = '```'.join(splitted[1:-1])
     return txt_out
 
-def breakdown_txt_to_satisfy_token_limit(txt, limit, must_break_at_empty_line=True):
-    from transformers import GPT2TokenizerFast
-    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-    get_token_cnt = lambda txt: len(tokenizer(txt)["input_ids"])
+def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit, must_break_at_empty_line=True):
     def cut(txt_tocut, must_break_at_empty_line):  # recursive
-        if get_token_cnt(txt_tocut) <= limit:
+        if get_token_fn(txt_tocut) <= limit:
             return [txt_tocut]
         else:
             lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_cnt(txt_tocut) * len(lines)
+            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
             estimated_line_cut = int(estimated_line_cut)
             for cnt in reversed(range(estimated_line_cut)):
                 if must_break_at_empty_line:
@@ -27,7 +24,7 @@ def breakdown_txt_to_satisfy_token_limit(txt, limit, must_break_at_empty_line=Tr
                 print(cnt)
                 prev = "\n".join(lines[:cnt])
                 post = "\n".join(lines[cnt:])
-                if get_token_cnt(prev) < limit: break
+                if get_token_fn(prev) < limit: break
             if cnt == 0:
                 print('what the f?')
                 raise RuntimeError("There is an extremely long line of text!")
````
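Taken together, these two hunks refactor `breakdown_txt_to_satisfy_token_limit` so that it no longer loads its own GPT-2 tokenizer: the token counter is now injected as the `get_token_fn` parameter. For orientation, below is a minimal, self-contained sketch of the splitting technique the function implements. The blank-line test inside the `must_break_at_empty_line` branch and the recursive handling of the remainder are not visible in these hunks, so those parts are assumptions, and the dead-end case is simplified to a single exception:

```python
def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit,
                                         must_break_at_empty_line=True):
    # Recursively split `txt` into chunks whose token count stays within `limit`.
    def cut(txt_tocut, must_break_at_empty_line):
        if get_token_fn(txt_tocut) <= limit:
            return [txt_tocut]  # already fits the budget
        lines = txt_tocut.split('\n')
        # Initial guess for the cut point, proportional to the token budget.
        estimated_line_cut = int(limit / get_token_fn(txt_tocut) * len(lines))
        for cnt in reversed(range(1, estimated_line_cut + 1)):
            if must_break_at_empty_line and lines[cnt] != '':
                continue  # assumption: in this mode, only cut at blank lines
            prev = '\n'.join(lines[:cnt])
            if get_token_fn(prev) < limit:
                post = '\n'.join(lines[cnt:])
                # Keep the fitting head and recurse on the rest (assumed tail).
                return [prev] + cut(post, must_break_at_empty_line)
        raise RuntimeError("A single line or block exceeds the token limit!")
    return cut(txt, must_break_at_empty_line)
```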
````diff
@@ -86,12 +83,12 @@ def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt,
 
 
     # Step 5: truncation and processing under the token limit
-    MAX_TOKEN = 2500
-    # from transformers import GPT2TokenizerFast
-    # print('loading tokenizer...')
-    # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-    # get_token_cnt = lambda txt: len(tokenizer(txt)["input_ids"])
-    # print('tokenizer loaded')
+    MAX_TOKEN = 3000
+    from transformers import GPT2TokenizerFast
+    print('loading tokenizer...')
+    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+    get_token_fn = lambda txt: len(tokenizer(txt)["input_ids"])
+    print('tokenizer loaded')
 
 
     # Step 6: task function
@@ -107,7 +104,7 @@ def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt,
         try:
             gpt_say = ""
             # break the code file into chunks
-            file_content_breakdown = breakdown_txt_to_satisfy_token_limit(file_content, MAX_TOKEN)
+            file_content_breakdown = breakdown_txt_to_satisfy_token_limit(file_content, get_token_fn, MAX_TOKEN)
             for file_content_partial in file_content_breakdown:
                 i_say = i_say_template(fp, file_content_partial)
                 # # ** gpt request **
````
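The efficiency gain the commit message promises comes from these last two hunks: `GPT2TokenizerFast.from_pretrained` used to run inside the splitter, i.e. once per processed file, and is now executed a single time in `全项目切换英文`, with the resulting counter handed to every breakdown call (the per-chunk budget also grows from 2500 to 3000 tokens). A hedged sketch of the pattern, with made-up file contents for illustration:

```python
from transformers import GPT2TokenizerFast

# Load the tokenizer once; from_pretrained() reads (or downloads) model files,
# so repeating it for every file would dominate the runtime.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
get_token_fn = lambda txt: len(tokenizer(txt)["input_ids"])

MAX_TOKEN = 3000  # per-chunk token budget used after this commit

# Hypothetical stand-in for the project files the real function iterates over.
file_contents = ["def f():\n    return 1\n\n" * 400]

for file_content in file_contents:
    chunks = breakdown_txt_to_satisfy_token_limit(file_content, get_token_fn, MAX_TOKEN)
    print(len(chunks), "chunks")
```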
 
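Continuing the snippet above, a quick sanity check on the splitter (a hypothetical test, not from the repo): every chunk should respect the budget, and rejoining the chunks with newlines should reproduce the input, since each cut consumes exactly one '\n':

```python
text = "import os\n\n" * 2000  # made-up input containing blank lines
chunks = breakdown_txt_to_satisfy_token_limit(text, get_token_fn, MAX_TOKEN)

assert all(get_token_fn(c) <= MAX_TOKEN for c in chunks)  # budget respected
assert "\n".join(chunks) == text                          # splitting is lossless
print(f"{len(chunks)} chunks, largest = {max(map(get_token_fn, chunks))} tokens")
```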