from math import ceil

import gradio as gr
from datasets import load_dataset, IterableDataset
from transformers import AutoTokenizer, PreTrainedTokenizer


def count_tokens(batch, tokenizer, text_column):
    # Tokenize a batch of samples and record how many tokens each sample produces.
    encoded = tokenizer(batch[text_column])
    return {"num_tokens": [len(input_ids) for input_ids in encoded["input_ids"]]}


def get_dataset_num_tokens(
    dataset: IterableDataset,
    tokenizer: PreTrainedTokenizer,
    text_column: str,
    progress=gr.Progress(),
) -> int:
    # Stream through the dataset, tokenizing in batches, and accumulate the total token count,
    # reporting progress to the Gradio UI as we go.
    progress((0, None), desc="Counting tokens", unit="tokens")
    ds = dataset.map(
        count_tokens,
        batched=True,
        batch_size=1000,
        fn_kwargs={"tokenizer": tokenizer, "text_column": text_column},
    )
    total_num_tokens = 0
    for sample in ds:
        total_num_tokens += sample["num_tokens"]
        progress((total_num_tokens, None), desc="Counting tokens", unit="tokens")
    return total_num_tokens


def calculate_steps(
    dataset_name: str,
    dataset_split: str,
    dataset_config: str | None,
    tokenizer_name: str,
    num_gpus_per_node: int,
    num_nodes: int,
    batch_size: int,
    grad_accum: int,
    block_size: int,
    text_column: str = "text",
    token: str | None = None,
):
    # Normalize optional text inputs: empty strings from the UI fall back to defaults.
    dataset_config = None if not dataset_config.strip() else dataset_config
    text_column = "text" if not text_column.strip() else text_column
    token = None if not token.strip() else token

    try:
        dataset = load_dataset(dataset_name, dataset_config, streaming=True, token=token, split=dataset_split)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=token)
        total_num_tokens = get_dataset_num_tokens(dataset, tokenizer, text_column)
    except Exception as exc:
        raise gr.Error(str(exc))
    else:
        # Number of block-sized samples the dataset yields, and the optimizer steps
        # needed to consume them all given the effective (global) batch size.
        dataset_size = ceil(total_num_tokens / block_size)
        world_size = num_gpus_per_node * num_nodes
        num_steps = ceil(dataset_size / (world_size * batch_size * grad_accum))
        return dataset_size, num_steps


with gr.Blocks() as demo:
    gr.Markdown(
        """# Steps Calculator
Calculate the number of steps required to run through your whole dataset with a given sequence length. This is \
especially useful when training with a streaming dataset and you're not sure how many steps you need to run through \
the dataset with a given tokenizer and block size."""
    )
    with gr.Row():
        dataset_name = gr.Text(label="Dataset name")
        dataset_split = gr.Text(label="Dataset split", value="train")
        dataset_config = gr.Text(label="Dataset config (optional)")
        tokenizer_name = gr.Text(label="Tokenizer name")
    with gr.Row():
        num_gpus_per_node = gr.Number(value=1, minimum=1, label="Number of GPUs per node")
        num_nodes = gr.Number(value=1, minimum=1, label="Number of nodes")
        batch_size = gr.Number(value=8, minimum=1, label="Batch size")
        grad_accum = gr.Number(value=1, minimum=1, label="Gradient accumulation steps")
        block_size = gr.Number(value=2048, minimum=1, label="Block size")
        text_column = gr.Text(value="text", label="Text column")
        token = gr.Text(label="HF access token (optional)")
    with gr.Row():
        with gr.Column():
            calculate_btn = gr.Button(value="Calculate")
        with gr.Column():
            samples = gr.Number(value=None, minimum=1, label="Total block-sized samples", interactive=False)
            steps = gr.Number(value=None, minimum=1, label="Total steps needed", interactive=False)

    calculate_btn.click(
        calculate_steps,
        inputs=[
            dataset_name,
            dataset_split,
            dataset_config,
            tokenizer_name,
            num_gpus_per_node,
            num_nodes,
            batch_size,
            grad_accum,
            block_size,
            text_column,
            token,
        ],
        outputs=[samples, steps],
        api_name="calculate-training-steps",
    )

if __name__ == "__main__":
    demo.queue().launch()