Clemspace's picture
Initial model upload
cb9e677
import logging
from dataclasses import dataclass, field
from simple_parsing.helpers import Serializable
logger = logging.getLogger("data")
@dataclass()
class InstructArgs(Serializable):
shuffle: bool = True
# For function calling training examples only the last tool call
# of the assistant message can be used for training. Therefore,
# we chunk longer function calling conversations into multiple
# training samples to not loose any data. E.g.:
# [[
# UserMessage_1, AssisantToolCallMessage_1, ToolMessage_1, AssisantMessage_1
# UserMessage_2, AssisantToolCallMessage_2, ToolMessage_2, AssisantMessage_2
# ]]
# => is chunked into two training samples:
# [[
# UserMessage_1, AssisantToolCallMessage_1, ToolMessage_1, AssisantMessage_1
# ],
# [
# UserMessage_1, AssisantToolCallMessage_1, ToolMessage_1, AssisantMessage_1
# UserMessage_2, AssisantToolCallMessage_2, ToolMessage_2, AssisantMessage_2
# ]]
# NOTE: Only if your data is already pre-chunked should this argument be set to False
dynamic_chunk_fn_call: bool = True
@dataclass()
class DataArgs(Serializable):
# The data arguments `data` and `instruct_data` are a string in the format
# "data_source_dir_1:weight_1,data_source_dir_2:weight_2,...". The weight
# will be used to sample the data sources. If the sum of the weights is
# not 1 when concatenating the two arguments `data` and `instruct_data`,
# it will be normalized. The data sources folders must contain jsonl files.
# If the value is an empty string, no data will be used for the corresponding
# data type.
data: str = (
"" # Each line in the jsonl files inside the data source directories must be a dictionary with a "text" key. See Readme for more details. Can be left empty.
)
shuffle: bool = False
instruct_data: str = (
"" # Each line in the jsonl files inside the data source directories must be a dictionary with a "interactions" key. See Readme for more details. Can be left empty.
)
eval_instruct_data: str = (
"" # Each line in the jsonl files inside the data source directories must be a dictionary with a "interactions" key. See Readme for more details. Can be left empty.
)
instruct: InstructArgs = field(default_factory=InstructArgs)
def __post_init__(self) -> None:
if (
self.instruct.shuffle is False
and self.instruct.dynamic_chunk_fn_call is True
):
raise ValueError(
"Make sure to either enable `data.instruct.shuffle=True` or `data.instruct.dynamic_chunk_fn_call=False`. Dynamic chunking is only possible if data is loaded and shuffled before training."
)