functionary-small-v3.2 / tokenization_functionary.py

fix tokenization

3db8af6 3 months ago

20.5 kB

	# Copyright (c) 2024, MeetKai Inc. All rights reserved.

	from copy import deepcopy
	import json
	from typing import Any, Dict, List, Literal, Optional, Union

	import jsonref
	from pydantic import BaseModel, Field, model_validator
	from typing_extensions import Self

	from transformers.tokenization_utils_base import BatchEncoding
	from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
	from transformers.utils import TensorType, logging


	# Module-level logger obtained from the `transformers` logging utilities imported above.
	logger = logging.get_logger(__name__)
	# Generic assistant system prompt. Not referenced elsewhere in the visible part of this file.
	SYSTEM_PROMPT = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
	# System prompt describing the code-interpreter ("python") tool. Not referenced elsewhere in the
	# visible part of this file.
	CODE_INTERPRETER_SYSTEM_PROMPT = """When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data' can be used to save and persist user files."""

	class Function(BaseModel):
	    # Description of one callable function that can be offered to the model.
	    # (Plain comments are used instead of a class docstring so the model's
	    # generated JSON schema is not altered.)
	    #
	    # name:        identifier of the function
	    # description: optional free-text description, empty string by default
	    # parameters:  optional JSON-schema dict describing the arguments
	    name: str
	    description: Optional[str] = ""
	    parameters: Optional[dict] = None


	class Tool(BaseModel):
	    # One entry of the `tools` list: either a user-defined function or the
	    # built-in code interpreter.
	    #
	    # type:     "function" or "code_interpreter"
	    # function: the function description; required when type is "function",
	    #           forbidden otherwise
	    type: Literal["function", "code_interpreter"]
	    function: Optional[Function] = None

	    @model_validator(mode="after")
	    def check_type_function_matches(self) -> Self:
	        """Check that `function` is present exactly when `type` is "function".

	        Returns:
	            Self: the validated instance, unchanged

	        Raises:
	            ValueError: when `type` and `function` disagree
	        """
	        # Explicit raises instead of `assert`: assertions are removed when
	        # Python runs with -O, which would silently disable this validation.
	        if self.type == "function":
	            if self.function is None:
	                raise ValueError(
	                    '"function" must contain function description when `"type": "function"`'
	                )
	        elif self.function is not None:
	            raise ValueError(
	                '"function" must not be provided when `"type": "code_interpreter"`'
	            )
	        return self


	def convert_data_type(param_type: str) -> str:
	    """Map a schema type name to the TypeScript type name used in the prompt.

	    Args:
	        param_type (str): type name taken from a parameter schema

	    Returns:
	        str: "number" for "integer" and "float"; every other value is
	            returned unchanged
	    """
	    numeric_names = ("integer", "float")
	    return "number" if param_type in numeric_names else param_type


	def get_param_type(param: Dict) -> str:
	    """Return the TypeScript type expression for one parameter schema.

	    The type is read from "type" (a single name or a list of names) or,
	    when "type" is absent, from the "type" of each "oneOf" alternative.
	    Several names are combined into a union written with " | ".

	    Args:
	        param (Dict): param dict in properties

	    Returns:
	        str: the type expression; "any" when no type can be determined
	    """
	    if "type" in param:
	        raw_param_type = param["type"]
	        if not isinstance(raw_param_type, list):
	            return convert_data_type(raw_param_type)
	        candidates = raw_param_type
	    else:
	        # In many cases the json schema contains "oneOf" instead of "type".
	        candidates = [
	            item["type"]
	            for item in param.get("oneOf", [])
	            if isinstance(item, dict) and "type" in item
	        ]

	    # Convert every member individually (so ["integer", "null"] becomes
	    # "number | null"), then drop duplicates while keeping schema order so the
	    # generated prompt is identical from run to run.
	    type_names = list(dict.fromkeys(convert_data_type(name) for name in candidates))
	    if not type_names:
	        return "any"
	    return " | ".join(type_names)


	def get_format_param(param: Dict) -> Optional[str]:
	    """Look up the "format" of a parameter, directly or inside "oneOf".

	    Args:
	        param (Dict): parameter schema

	    Returns:
	        Optional[str]: the value of "format" when the key is present;
	            otherwise the formats found in the "oneOf" alternatives joined
	            with " or "; None when neither yields a format
	    """
	    if "format" in param:
	        return param["format"]
	    alternative_formats = [
	        alternative["format"]
	        for alternative in param.get("oneOf", ())
	        if "format" in alternative
	    ]
	    return " or ".join(alternative_formats) if alternative_formats else None


	def get_param_info(param: Dict) -> Optional[str]:
	    """Build the one-line "// ..." comment that documents a parameter.

	    The comment collects, in this order: the description (a trailing "." is
	    added when missing), the default value (quoted when the parameter type is
	    "string"), the format, and the maximum / minimum / maxLength / minLength
	    constraints.

	    Args:
	        param (Dict): parameter schema

	    Returns:
	        Optional[str]: the comment with newlines replaced by spaces, or None
	            when the schema provides none of the fields above
	    """
	    pieces = []

	    if "description" in param:
	        text = param["description"]
	        pieces.append(text if text.endswith(".") else text + ".")

	    if "default" in param:
	        shown_default = param["default"]
	        if param.get("type", "any") == "string":
	            shown_default = f'"{shown_default}"'  # string defaults are shown quoted
	        pieces.append(f"Default={shown_default}.")

	    declared_format = get_format_param(param)
	    if declared_format is not None:
	        pieces.append("Format=" + declared_format)

	    labelled_constraints = (
	        ("maximum", "Maximum"),
	        ("minimum", "Minimum"),
	        ("maxLength", "Maximum length"),
	        ("minLength", "Minimum length"),
	    )
	    pieces.extend(
	        label + "=" + str(param[key])
	        for key, label in labelled_constraints
	        if key in param
	    )

	    if not pieces:
	        return None
	    return ("// " + " ".join(pieces)).replace("\n", " ")


	def append_new_param_info(
	    info_list: List[str],
	    param_declaration: str,
	    comment_info: Optional[str],
	    examples_info: List,
	    depth: int,
	):
	    """Append one parameter declaration, preceded by its comment lines, to info_list.

	    Output format is: comment line, example lines, then the declaration.
	    Example lines are only written when a comment is present; without a
	    comment just the declaration is appended.

	    Args:
	        info_list (List[str]): list of lines, extended in place
	        param_declaration (str): declaration text, "param: type"
	        comment_info (Optional[str]): comment line for the parameter, if any
	        examples_info (List): example lines for the parameter
	        depth (int): nesting level; each level indents by one space
	    """
	    indent = " " * depth if depth >= 1 else ""
	    declaration_line = f"{indent}{param_declaration}"

	    if comment_info is None:
	        info_list.append(declaration_line)
	        return

	    info_list.append(f"{indent}{comment_info}")
	    info_list.extend(f"{indent}{example}" for example in examples_info)
	    info_list.append(declaration_line)


	def get_examples_info(param_name: str, examples: List) -> List:
	    """Render the examples of a parameter as comment lines.

	    Args:
	        param_name (str): name of the parameter, used in the heading line
	        examples (List): example values; dicts and lists are written as JSON,
	            anything else through str()

	    Returns:
	        List: a "// Example <name>:" heading followed by one "// <value>"
	            line per example, with newlines in a value written as "\\n"
	    """

	    def as_comment(example) -> str:
	        if isinstance(example, (dict, list)):
	            text = json.dumps(example, ensure_ascii=False)
	        else:
	            text = str(example)
	        return "// " + text.replace('\n', '\\n')

	    return [f"// Example {param_name}:", *map(as_comment, examples)]


	def get_enum_option_str(enum_options: List) -> str:
	    """Join enum options into a TypeScript union, separated by " | ".

	    Args:
	        enum_options (List): list of options

	    Returns:
	        str: the options joined with " | "; string options are wrapped in
	            double quotes, every other option is rendered with str()
	    """
	    # The separator is a plain pipe: a backslash before it would be emitted
	    # literally and is not part of the union syntax.
	    return " | ".join(
	        f'"{option}"' if isinstance(option, str) else str(option)
	        for option in enum_options
	    )


	def get_array_typescript(
	    param_name: Optional[str], param_dic: dict, depth: int = 0
	) -> str:
	    """Recursively render an array schema as a TypeScript-style declaration.

	    Args:
	        param_name (Optional[str]): name of the parameter; None when rendering
	            the element type of an enclosing array
	        param_dic (dict): array schema; its "items" entry describes the elements
	        depth (int, optional): nesting level, one space of indent per level.
	            Defaults to 0.

	    Returns:
	        str: the declaration. With a name it is prefixed by the indent and
	            "<name>: "; only the plain (non-object, non-array, non-enum)
	            named form ends with a trailing comma.
	    """
	    indent = " " * depth if depth >= 1 else ""
	    element_schema = param_dic.get("items", {})

	    # No element description at all: untyped array.
	    if len(element_schema) == 0:
	        return "[]" if param_name is None else f"{indent}{param_name}: []"

	    element_type = get_param_type(element_schema)

	    if element_type == "object":
	        member_lines = get_parameter_typescript(
	            element_schema.get("properties", {}),
	            element_schema.get("required", []),
	            depth + 1,
	        )
	        if param_name is None:
	            opening = indent + "{"
	        else:
	            opening = f"{indent}{param_name}" + ": {"
	        return "\n".join([opening, *member_lines, indent + "}[]"])

	    if element_type == "array":
	        inner = get_array_typescript(None, element_schema, depth + 1)
	        if param_name is None:
	            return f"{inner}[]"
	        return f"{indent}{param_name}: {inner.strip()}[]"

	    if "enum" in element_schema:
	        options = get_enum_option_str(element_schema["enum"])
	        if param_name is None:
	            return f"({options})[]"
	        return f"{indent}{param_name}: ({options})[]"

	    if param_name is None:
	        return f"{element_type}[]"
	    return f"{indent}{param_name}: {element_type}[],"


	def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
	    """Render schema properties as TypeScript-style declaration lines for the prompt.

	    Recurses into nested objects directly and into arrays through
	    get_array_typescript. Each parameter contributes its comment line,
	    its example lines and its declaration.

	    Args:
	        properties (dict): the "properties" mapping of a parameters schema;
	            entries whose value is not a dict are skipped
	        required_params (list): names of the required parameters; every other
	            parameter gets a "?" suffix. When this is not a list, no
	            parameter is marked optional.
	        depth (int, optional): the depth of params (nested level), one space of
	            indent per level. Defaults to 0.

	    Returns:
	        List[str]: lines describing all parameters
	    """
	    indent = " " * depth if depth >= 1 else ""
	    tp_lines = []

	    for param_name, param in properties.items():
	        # Sometimes properties have a "required" field as a list of strings,
	        # even though it is not supposed to be under properties. Skip it.
	        if not isinstance(param, dict):
	            continue

	        comment_info = get_param_info(param)
	        examples_info = []
	        if "examples" in param:
	            examples_info = get_examples_info(param_name, param["examples"])

	        param_declaration = f"{param_name}"
	        if isinstance(required_params, list) and param_name not in required_params:
	            param_declaration += "?"
	        param_type = get_param_type(param)

	        # Comment and example lines written above object and typed-array
	        # declarations (emitted even when there is no comment).
	        header_lines = []
	        if comment_info is not None:
	            header_lines.append(f"{indent}{comment_info}")
	        header_lines.extend(f"{indent}{example}" for example in examples_info)

	        if param_type == "object":
	            child_lines = get_parameter_typescript(
	                param.get("properties", {}), param.get("required", []), depth + 1
	            )
	            tp_lines.extend(header_lines)
	            tp_lines.append(f"{indent}{param_declaration}" + ": {")
	            tp_lines.extend(child_lines)
	            tp_lines.append(indent + "},")

	        elif param_type == "array":
	            item_info = param.get("items", {})
	            if "type" not in item_info:  # don't know type of array
	                append_new_param_info(
	                    tp_lines,
	                    param_declaration + ": [],",
	                    comment_info,
	                    examples_info,
	                    depth,
	                )
	            else:
	                array_declaration = get_array_typescript(
	                    param_declaration, param, depth
	                )
	                if not array_declaration.endswith(","):
	                    array_declaration += ","
	                tp_lines.extend(header_lines)
	                tp_lines.append(array_declaration)

	        else:
	            if "enum" in param:
	                param_type = get_enum_option_str(param["enum"])
	            if param.get("nullable") is True:
	                # Plain pipe: a backslash here would end up in the prompt.
	                param_type += " | null"
	            append_new_param_info(
	                tp_lines,
	                param_declaration + f": {param_type},",
	                comment_info,
	                examples_info,
	                depth,
	            )

	    return tp_lines

	def generate_schema_from_functions(
	    functions: List[Union["Function", Dict[str, Any]]], namespace="functions"
	) -> str:
	    """Convert functions schema to a schema that language models can understand.

	    Each function becomes a "// <description>" comment followed by a
	    "type <name> = ..." declaration inside a "namespace <namespace> { ... }"
	    block.

	    Args:
	        functions: function descriptions, either dicts or objects providing
	            model_dump(); entries without a "name" are skipped
	        namespace (str, optional): name of the enclosing namespace.
	            Defaults to "functions".

	    Returns:
	        str: the schema text
	    """
	    chunks = [
	        "// Supported function definitions that should be called when necessary.\n",
	        f"namespace {namespace} {{\n\n",
	    ]

	    for function in functions:
	        # Convert a Function object to dict, if necessary
	        if not isinstance(function, dict):
	            function = function.model_dump()
	        function_name = function.get("name", None)
	        if function_name is None:
	            continue

	        # The key can be present with a None value (the field is optional on
	        # Function); treat that like an empty description instead of
	        # writing the text "None" into the prompt.
	        description = function.get("description") or ""
	        chunks.append(f"// {description}\n")
	        chunks.append(f"type {function_name}")

	        parameters = function.get("parameters", None)
	        if parameters is not None and parameters.get("properties") is not None:
	            # Resolve "$ref" entries, then copy so the result is a plain structure.
	            parameters = deepcopy(jsonref.JsonRef.replace_refs(parameters))
	            tp_lines = get_parameter_typescript(
	                parameters.get("properties"),
	                parameters.get("required", []),
	                0,
	            )
	            chunks.append(" = (_: {\n")
	            chunks.append("\n".join(tp_lines))
	            chunks.append("\n}) => any;\n\n")
	        else:
	            # Doesn't have any parameters
	            chunks.append(" = () => any;\n\n")

	    chunks.append(f"}} // namespace {namespace}")
	    return "".join(chunks)

	class FunctionaryTokenizer(PreTrainedTokenizerFast):
	    """Fast tokenizer whose chat-template rendering prepends the tool-calling system prompts."""

	    def apply_chat_template(
	        self,
	        conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]], str],
	        tools: Optional[List[Dict[str, Any]]] = None,
	        chat_template: Optional[str] = None,
	        add_generation_prompt: bool = False,
	        tokenize: bool = True,
	        padding: bool = False,
	        truncation: bool = False,
	        max_length: Optional[int] = None,
	        return_tensors: Optional[Union[str, TensorType]] = None,
	        return_dict: bool = False,
	        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
	        **kwargs,
	    ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
	        """Render conversation(s) with the chat template, optionally tokenizing the result.

	        A system message containing the function schema built from `tools` is
	        placed in front of every conversation; when `tools` contains a
	        "code_interpreter" entry a second system message describing the python
	        tool follows it. The caller's `conversation` object is not modified.

	        Args:
	            conversation: a list of message dicts, or a batch (list/tuple) of
	                such lists or of objects exposing `.messages`
	            tools: tool descriptions. Items with a non-None "function" entry
	                contribute that entry, items of type "code_interpreter" enable
	                the python system message, any other item is treated as a
	                function description itself (old format).
	            chat_template: template to use, or the name of a template when the
	                tokenizer holds a dict of templates
	            add_generation_prompt: forwarded to the template
	            tokenize: when False the rendered text is returned
	            padding, truncation, max_length, return_tensors: forwarded to the
	                tokenizer call
	            return_dict: return the full tokenizer output instead of only
	                its "input_ids"
	            tokenizer_kwargs: extra keyword arguments for the tokenizer call
	            **kwargs: extra variables for the template; they take precedence
	                over the special tokens map

	        Returns:
	            The rendered string(s) when `tokenize` is False, otherwise the
	            tokenizer output or its "input_ids".

	        Raises:
	            ValueError: when `return_dict` is True while `tokenize` is False,
	                or when several templates exist and none can be selected
	        """

	        if return_dict and not tokenize:
	            raise ValueError(
	                "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
	                "of tokenizer outputs to return."
	            )

	        if tokenizer_kwargs is None:
	            tokenizer_kwargs = {}

	        using_default_template = False

	        # First, handle the cases when the model has a dict of multiple templates
	        if isinstance(self.chat_template, dict) or (
	            self.chat_template is None and isinstance(self.default_chat_template, dict)
	        ):
	            if self.chat_template is not None:
	                template_dict = self.chat_template
	                using_default_dict = False
	            else:
	                template_dict = self.default_chat_template
	                using_default_dict = True
	            if chat_template is not None and chat_template in template_dict:
	                # The user can pass the name of a template to the chat template argument instead of an entire template
	                chat_template = template_dict[chat_template]
	                if using_default_dict:
	                    using_default_template = True
	            elif chat_template is None and "default" in template_dict:
	                chat_template = template_dict["default"]
	                if using_default_dict:
	                    using_default_template = True
	            elif chat_template is None:
	                raise ValueError(
	                    "This model has multiple chat templates with no default specified! Please either pass a chat "
	                    "template or the name of the template you wish to use to the `chat_template` argument. Available "
	                    f"template names are {sorted(template_dict.keys())}."
	                )
	        elif chat_template is None:
	            # These are the cases when the model has a single template
	            # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
	            if self.chat_template is not None:
	                chat_template = self.chat_template
	            else:
	                chat_template = self.default_chat_template
	                using_default_template = True

	        if using_default_template:
	            logger.warning_once(
	                "No chat template is set for this tokenizer, falling back to a default class-level template. This is "
	                "very error-prone, because models are often trained with templates different from the class default! "
	                "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
	                "point any code depending on them will stop working. We recommend setting a valid chat template before "
	                "then to ensure that this model continues working without issues."
	            )

	        PYTHON_RUN_SYS_MSG = "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data' can be used to save and persist user files."
	        # The continuation lines of this literal are prompt text, kept verbatim:
	        # they must not be re-indented together with the surrounding code.
	        SYSTEM_CONTENT = """You are capable of executing available function(s) if required.
	Only execute function(s) when absolutely necessary.
	Ask for the required input to:recipient==all
	Use JSON for function arguments.
	Respond in this format:
	>>>${recipient}
	${content}
	Available functions:
	"""

	        # Prepare tools/functions into schema
	        functions_pydantic_to_render = []
	        has_code_interpreter = False
	        if tools is not None:
	            for item in tools:
	                if "function" in item and item["function"] is not None:
	                    # new data format: tools: [{"type": xx, "function": xxx}]
	                    functions_pydantic_to_render.append(item["function"])
	                elif "type" in item and item["type"] == "code_interpreter":
	                    has_code_interpreter = True
	                else:
	                    functions_pydantic_to_render.append(item)  # old format

	        system_messages = [
	            {
	                "role": "system",
	                "content": SYSTEM_CONTENT + generate_schema_from_functions(functions_pydantic_to_render),
	            }
	        ]
	        if has_code_interpreter:
	            system_messages.append({"role": "system", "content": PYTHON_RUN_SYS_MSG})

	        # Compilation function uses a cache to avoid recompiling the same template
	        compiled_template = self._compile_jinja_template(chat_template)

	        # Batch detection looks at the conversation as passed in, before any
	        # system message is added, so a list of conversations is recognised.
	        if (
	            isinstance(conversation, (list, tuple))
	            and len(conversation) > 0
	            and (isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages"))
	        ):
	            conversations = conversation
	            is_batched = True
	        else:
	            conversations = [conversation]
	            is_batched = False

	        rendered = []
	        # kwargs overwrite special tokens if both are present
	        template_kwargs = {**self.special_tokens_map, **kwargs}
	        for chat in conversations:
	            if hasattr(chat, "messages"):
	                # Indicates it's a Conversation object
	                chat = chat.messages
	            # Build a new list so the caller's messages are never modified;
	            # inserting in place would add the system prompt again on every call.
	            messages = system_messages + list(chat)
	            rendered_chat = compiled_template.render(
	                messages=messages, add_generation_prompt=add_generation_prompt, **template_kwargs
	            )
	            rendered.append(rendered_chat)

	        if not is_batched:
	            rendered = rendered[0]

	        if not tokenize:
	            return rendered

	        out = self(
	            rendered,
	            padding=padding,
	            truncation=truncation,
	            max_length=max_length,
	            add_special_tokens=False,
	            return_tensors=return_tensors,
	            **tokenizer_kwargs,
	        )
	        if return_dict:
	            return out
	        return out["input_ids"]