Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Encode special-token text as ordinary text (tiktoken disallowed_special=()) + limit number of source files to <= 512
Browse files
    	
        crazy_functions/Latex全文润色.py
    CHANGED
    
    | 
         @@ -14,7 +14,7 @@ class PaperFileGroup(): 
     | 
|
| 14 | 
         
             
                    import tiktoken
         
     | 
| 15 | 
         
             
                    from toolbox import get_conf
         
     | 
| 16 | 
         
             
                    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 17 | 
         
            -
                    def get_token_num(txt): return len(enc.encode(txt))
         
     | 
| 18 | 
         
             
                    self.get_token_num = get_token_num
         
     | 
| 19 | 
         | 
| 20 | 
         
             
                def run_file_split(self, max_token_limit=1900):
         
     | 
| 
         | 
|
| 14 | 
         
             
                    import tiktoken
         
     | 
| 15 | 
         
             
                    from toolbox import get_conf
         
     | 
| 16 | 
         
             
                    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 17 | 
         
            +
                    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         
     | 
| 18 | 
         
             
                    self.get_token_num = get_token_num
         
     | 
| 19 | 
         | 
| 20 | 
         
             
                def run_file_split(self, max_token_limit=1900):
         
     | 
    	
        crazy_functions/Latex全文翻译.py
    CHANGED
    
    | 
         @@ -14,7 +14,7 @@ class PaperFileGroup(): 
     | 
|
| 14 | 
         
             
                    import tiktoken
         
     | 
| 15 | 
         
             
                    from toolbox import get_conf
         
     | 
| 16 | 
         
             
                    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 17 | 
         
            -
                    def get_token_num(txt): return len(enc.encode(txt))
         
     | 
| 18 | 
         
             
                    self.get_token_num = get_token_num
         
     | 
| 19 | 
         | 
| 20 | 
         
             
                def run_file_split(self, max_token_limit=1900):
         
     | 
| 
         | 
|
| 14 | 
         
             
                    import tiktoken
         
     | 
| 15 | 
         
             
                    from toolbox import get_conf
         
     | 
| 16 | 
         
             
                    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 17 | 
         
            +
                    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         
     | 
| 18 | 
         
             
                    self.get_token_num = get_token_num
         
     | 
| 19 | 
         | 
| 20 | 
         
             
                def run_file_split(self, max_token_limit=1900):
         
     | 
    	
        crazy_functions/crazy_utils.py
    CHANGED
    
    | 
         @@ -6,7 +6,7 @@ def input_clipping(inputs, history, max_token_limit): 
     | 
|
| 6 | 
         
             
                import numpy as np
         
     | 
| 7 | 
         
             
                from toolbox import get_conf
         
     | 
| 8 | 
         
             
                enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 9 | 
         
            -
                def get_token_num(txt): return len(enc.encode(txt))
         
     | 
| 10 | 
         | 
| 11 | 
         
             
                mode = 'input-and-history'
         
     | 
| 12 | 
         
             
                # 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
         
     | 
| 
         @@ -23,7 +23,7 @@ def input_clipping(inputs, history, max_token_limit): 
     | 
|
| 23 | 
         | 
| 24 | 
         
             
                while n_token > max_token_limit:
         
     | 
| 25 | 
         
             
                    where = np.argmax(everything_token)
         
     | 
| 26 | 
         
            -
                    encoded = enc.encode(everything[where])
         
     | 
| 27 | 
         
             
                    clipped_encoded = encoded[:len(encoded)-delta]
         
     | 
| 28 | 
         
             
                    everything[where] = enc.decode(clipped_encoded)[:-1]    # -1 to remove the may-be illegal char
         
     | 
| 29 | 
         
             
                    everything_token[where] = get_token_num(everything[where])
         
     | 
| 
         | 
|
| 6 | 
         
             
                import numpy as np
         
     | 
| 7 | 
         
             
                from toolbox import get_conf
         
     | 
| 8 | 
         
             
                enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 9 | 
         
            +
                def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         
     | 
| 10 | 
         | 
| 11 | 
         
             
                mode = 'input-and-history'
         
     | 
| 12 | 
         
             
                # 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
         
     | 
| 
         | 
|
| 23 | 
         | 
| 24 | 
         
             
                while n_token > max_token_limit:
         
     | 
| 25 | 
         
             
                    where = np.argmax(everything_token)
         
     | 
| 26 | 
         
            +
                    encoded = enc.encode(everything[where], disallowed_special=())
         
     | 
| 27 | 
         
             
                    clipped_encoded = encoded[:len(encoded)-delta]
         
     | 
| 28 | 
         
             
                    everything[where] = enc.decode(clipped_encoded)[:-1]    # -1 to remove the may-be illegal char
         
     | 
| 29 | 
         
             
                    everything_token[where] = get_token_num(everything[where])
         
     | 
    	
        crazy_functions/代码重写为全英文_多线程.py
    CHANGED
    
    | 
         @@ -62,7 +62,7 @@ def 全项目切换英文(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_ 
     | 
|
| 62 | 
         
             
                import tiktoken
         
     | 
| 63 | 
         
             
                from toolbox import get_conf
         
     | 
| 64 | 
         
             
                enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 65 | 
         
            -
                def get_token_fn(txt): return len(enc.encode(txt))
         
     | 
| 66 | 
         | 
| 67 | 
         | 
| 68 | 
         
             
                # 第6步:任务函数
         
     | 
| 
         | 
|
| 62 | 
         
             
                import tiktoken
         
     | 
| 63 | 
         
             
                from toolbox import get_conf
         
     | 
| 64 | 
         
             
                enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 65 | 
         
            +
                def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
         
     | 
| 66 | 
         | 
| 67 | 
         | 
| 68 | 
         
             
                # 第6步:任务函数
         
     | 
    	
        crazy_functions/批量Markdown翻译.py
    CHANGED
    
    | 
         @@ -14,7 +14,7 @@ class PaperFileGroup(): 
     | 
|
| 14 | 
         
             
                    import tiktoken
         
     | 
| 15 | 
         
             
                    from toolbox import get_conf
         
     | 
| 16 | 
         
             
                    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 17 | 
         
            -
                    def get_token_num(txt): return len(enc.encode(txt))
         
     | 
| 18 | 
         
             
                    self.get_token_num = get_token_num
         
     | 
| 19 | 
         | 
| 20 | 
         
             
                def run_file_split(self, max_token_limit=1900):
         
     | 
| 
         | 
|
| 14 | 
         
             
                    import tiktoken
         
     | 
| 15 | 
         
             
                    from toolbox import get_conf
         
     | 
| 16 | 
         
             
                    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 17 | 
         
            +
                    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         
     | 
| 18 | 
         
             
                    self.get_token_num = get_token_num
         
     | 
| 19 | 
         | 
| 20 | 
         
             
                def run_file_split(self, max_token_limit=1900):
         
     | 
    	
        crazy_functions/批量翻译PDF文档_多线程.py
    CHANGED
    
    | 
         @@ -70,7 +70,7 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, 
     | 
|
| 70 | 
         
             
                    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
         
     | 
| 71 | 
         
             
                    from toolbox import get_conf
         
     | 
| 72 | 
         
             
                    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 73 | 
         
            -
                    def get_token_num(txt): return len(enc.encode(txt))
         
     | 
| 74 | 
         
             
                    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         
     | 
| 75 | 
         
             
                        txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
         
     | 
| 76 | 
         
             
                    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         
     | 
| 
         | 
|
| 70 | 
         
             
                    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
         
     | 
| 71 | 
         
             
                    from toolbox import get_conf
         
     | 
| 72 | 
         
             
                    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 73 | 
         
            +
                    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         
     | 
| 74 | 
         
             
                    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         
     | 
| 75 | 
         
             
                        txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
         
     | 
| 76 | 
         
             
                    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         
     | 
    	
        crazy_functions/理解PDF文档内容.py
    CHANGED
    
    | 
         @@ -19,7 +19,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro 
     | 
|
| 19 | 
         
             
                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
         
     | 
| 20 | 
         
             
                from toolbox import get_conf
         
     | 
| 21 | 
         
             
                enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 22 | 
         
            -
                def get_token_num(txt): return len(enc.encode(txt))
         
     | 
| 23 | 
         
             
                paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         
     | 
| 24 | 
         
             
                    txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
         
     | 
| 25 | 
         
             
                page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         
     | 
| 
         | 
|
| 19 | 
         
             
                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
         
     | 
| 20 | 
         
             
                from toolbox import get_conf
         
     | 
| 21 | 
         
             
                enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         
     | 
| 22 | 
         
            +
                def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         
     | 
| 23 | 
         
             
                paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         
     | 
| 24 | 
         
             
                    txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
         
     | 
| 25 | 
         
             
                page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         
     | 
    	
        crazy_functions/解析项目源代码.py
    CHANGED
    
    | 
         @@ -11,7 +11,8 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs, 
     | 
|
| 11 | 
         
             
                history_array = []
         
     | 
| 12 | 
         
             
                sys_prompt_array = []
         
     | 
| 13 | 
         
             
                report_part_1 = []
         
     | 
| 14 | 
         
            -
             
     | 
| 
         | 
|
| 15 | 
         
             
                ############################## <第一步,逐个文件分析,多线程> ##################################
         
     | 
| 16 | 
         
             
                for index, fp in enumerate(file_manifest):
         
     | 
| 17 | 
         
             
                    with open(fp, 'r', encoding='utf-8', errors='replace') as f:
         
     | 
| 
         | 
|
| 11 | 
         
             
                history_array = []
         
     | 
| 12 | 
         
             
                sys_prompt_array = []
         
     | 
| 13 | 
         
             
                report_part_1 = []
         
     | 
| 14 | 
         
            +
                
         
     | 
| 15 | 
         
            +
                assert len(file_manifest) <= 512, "源文件太多, 请缩减输入文件的数量, 或者删除此行并拆分file_manifest以保证结果能被分批存储。"
         
     | 
| 16 | 
         
             
                ############################## <第一步,逐个文件分析,多线程> ##################################
         
     | 
| 17 | 
         
             
                for index, fp in enumerate(file_manifest):
         
     | 
| 18 | 
         
             
                    with open(fp, 'r', encoding='utf-8', errors='replace') as f:
         
     |