"""Small helpers for chat-message normalization, token counting, base64 and JSONL I/O."""

try:
    import tiktoken
except ImportError:
    # Only tiktoken_counter() needs tiktoken; keep the other helpers
    # importable when it is not installed.
    tiktoken = None

# Encoder created on the first tiktoken_counter() call and reused afterwards.
_enc_model = None


def normalize2uaua(message, if_replace_system=False):
    """Merge runs of consecutive same-role messages into single messages.

    Args:
        message: Iterable of dicts, each with at least "role" and "content"
            keys; "content" values are joined with "\\n" when merged.
        if_replace_system: When true, messages whose role is "system" are
            treated as, and emitted with, the role "user".

    Returns:
        A new list of message dicts in which no two neighbours share a role.
        The dicts are shallow copies, so the caller's messages are never
        modified. A merged message keeps the extra keys of the first message
        in its run.
    """
    new_message = []
    last_role = ""
    for msg in message:
        role = msg["role"]
        if if_replace_system and role == "system":
            role = "user"
        # The emptiness check keeps a first message whose role is "" from
        # indexing into an empty list.
        if new_message and last_role == role:
            previous = new_message[-1]
            previous["content"] = previous["content"] + "\n" + msg["content"]
        else:
            last_role = role
            # Copy so merging never writes into the caller's dict, and so the
            # replaced role is actually reflected in the output.
            merged = dict(msg)
            merged["role"] = role
            new_message.append(merged)
    return new_message


def tiktoken_counter(text):
    """Return the number of "cl100k_base" tokens in ``text``.

    Raises:
        ImportError: If the tiktoken package is not installed.
    """
    global _enc_model
    if _enc_model is None:
        if tiktoken is None:
            raise ImportError("tiktoken is required by tiktoken_counter()")
        _enc_model = tiktoken.get_encoding("cl100k_base")
    return len(_enc_model.encode(text))


def string_to_base64(text):
    """Encode ``text`` as UTF-8 and return the base64 form as a str."""
    import base64

    # Encoding the whole string at once yields the same bytes as encoding it
    # character by character, without the repeated bytes concatenation.
    return base64.b64encode(text.encode('utf-8')).decode('utf-8')


def base64_to_string(base64_data):
    """Decode base64 data and return the result decoded as UTF-8 text."""
    import base64

    return base64.b64decode(base64_data).decode('utf-8')


def float_array_to_base64(float_arr):
    """Pack floats as 4-byte network-order values and base64-encode them.

    Args:
        float_arr: Iterable of values accepted by ``struct.pack('!f', ...)``.

    Returns:
        The base64 text (str) of the concatenated 4-byte encodings.
    """
    import base64
    import struct

    # Pack every float into 4 bytes, then join once instead of growing a
    # bytes object inside the loop.
    pack_float = struct.Struct('!f').pack
    byte_array = b''.join(pack_float(f) for f in float_arr)
    # Base64-encode the packed bytes.
    return base64.b64encode(byte_array).decode('utf-8')


def base64_to_float_array(base64_data):
    """Decode base64 data into a list of floats (inverse of float_array_to_base64).

    Raises:
        struct.error: If the decoded byte length is not a multiple of 4.
    """
    import base64
    import struct

    byte_array = base64.b64decode(base64_data)
    # Every 4 bytes are parsed as one network-order float.
    return [num for (num,) in struct.iter_unpack('!f', byte_array)]


def load_datas_from_jsonl(file_path):
    """Read a UTF-8 JSON Lines file and return the parsed records as a list.

    Lines that contain only whitespace are skipped, so blank or trailing
    empty lines do not raise a JSON decoding error.
    """
    import json

    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def save_datas_to_jsonl(file_path, datas):
    """Write each item of ``datas`` as one JSON line to a UTF-8 file.

    The file is overwritten; non-ASCII characters are written unescaped.
    """
    import json

    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(data, ensure_ascii=False) + '\n' for data in datas
        )