from math import ceil

import gradio as gr
from datasets import load_dataset, IterableDataset
from transformers import AutoTokenizer, PreTrainedTokenizer


def count_tokens(batch, tokenizer, text_column):
    """Tokenize a batch of texts and return the token count of each sample."""
    encoded = tokenizer(batch[text_column])
    return {"num_tokens": [len(input_ids) for input_ids in encoded["input_ids"]]}
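
# For illustration only (hypothetical values): with a batch such as
#   {"text": ["hello world", "hi"]}
# count_tokens returns something like {"num_tokens": [2, 1]}, and datasets
# merges this new column into the streamed samples.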


def get_dataset_num_tokens(
    dataset: IterableDataset, tokenizer: PreTrainedTokenizer, text_column: str, progress=gr.Progress()
) -> int:
    """Stream through the whole dataset and sum the token counts of all samples."""
    progress((0, None), desc="Counting tokens", unit="tokens")
    ds = dataset.map(
        count_tokens, batched=True, batch_size=1000, fn_kwargs={"tokenizer": tokenizer, "text_column": text_column}
    )
    total_num_tokens = 0
    # Iterating over the streaming dataset is what actually triggers tokenization.
    for sample in ds:
        total_num_tokens += sample["num_tokens"]
        progress((total_num_tokens, None), desc="Counting tokens", unit="tokens")
    return total_num_tokens
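
# A minimal standalone sketch of the same counting loop; the dataset and
# tokenizer names are assumptions, and gr.Progress updates only render when
# the function is called from within a Gradio event:
#
#   ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train", streaming=True)
#   tok = AutoTokenizer.from_pretrained("gpt2")
#   total = get_dataset_num_tokens(ds, tok, "text")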


def calculate_steps(
    dataset_name: str,
    dataset_split: str,
    dataset_config: str | None,
    tokenizer_name: str,
    num_gpus_per_node: int,
    num_nodes: int,
    batch_size: int,
    grad_accum: int,
    block_size: int,
    text_column: str = "text",
    token: str | None = None,
):
    """Calculate how many block-sized samples the dataset packs into, and how many training steps one pass takes."""
    # Normalize empty textbox values to sensible defaults.
    dataset_config = None if not dataset_config.strip() else dataset_config
    text_column = "text" if not text_column.strip() else text_column
    token = None if not token.strip() else token

    try:
        dataset = load_dataset(dataset_name, dataset_config, streaming=True, token=token, split=dataset_split)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=token)
        total_num_tokens = get_dataset_num_tokens(dataset, tokenizer, text_column)
    except Exception as exc:
        # Surface loading/tokenization errors in the UI instead of crashing.
        raise gr.Error(str(exc)) from exc
    else:
        # Number of block-sized samples after packing all tokens into blocks.
        dataset_size = ceil(total_num_tokens / block_size)
        world_size = num_gpus_per_node * num_nodes
        # One optimizer step consumes world_size * batch_size * grad_accum samples.
        num_steps = ceil(dataset_size / (world_size * batch_size * grad_accum))
        return dataset_size, num_steps
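
# Worked example (made-up numbers): 1,000,000 tokens with block_size=2048 gives
# ceil(1_000_000 / 2048) = 489 block-sized samples. With 2 GPUs on 1 node,
# batch_size=8 and grad_accum=1, each step consumes 2 * 8 * 1 = 16 samples,
# so one pass over the dataset takes ceil(489 / 16) = 31 steps.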


with gr.Blocks() as demo:
    gr.Markdown(
        """# Steps Calculator

Calculate the number of steps needed to run through your whole dataset with a given sequence length. This is \
especially useful when training on a streaming dataset, where you do not know in advance how many steps one full \
pass takes with a given tokenizer and block size."""
    )
    with gr.Row():
        dataset_name = gr.Text(label="Dataset name")
        dataset_split = gr.Text(label="Dataset split", value="train")
        dataset_config = gr.Text(label="Dataset config (optional)")
        tokenizer_name = gr.Text(label="Tokenizer name")
    with gr.Row():
        num_gpus_per_node = gr.Number(value=1, minimum=1, label="Number of GPUs per node")
        num_nodes = gr.Number(value=1, minimum=1, label="Number of nodes")
        batch_size = gr.Number(value=8, minimum=1, label="Batch size")
        grad_accum = gr.Number(value=1, minimum=1, label="Gradient accumulation steps")
        block_size = gr.Number(value=2048, minimum=1, label="Block size")
        text_column = gr.Text(value="text", label="Text column")
        token = gr.Text(label="HF access token (optional)")
    with gr.Row():
        with gr.Column():
            calculate_btn = gr.Button(value="Calculate")
        with gr.Column():
            samples = gr.Number(value=None, minimum=1, label="Total block-sized samples", interactive=False)
            steps = gr.Number(value=None, minimum=1, label="Total steps needed", interactive=False)
    calculate_btn.click(
        calculate_steps,
        inputs=[
            dataset_name,
            dataset_split,
            dataset_config,
            tokenizer_name,
            num_gpus_per_node,
            num_nodes,
            batch_size,
            grad_accum,
            block_size,
            text_column,
            token,
        ],
        outputs=[samples, steps],
        api_name="calculate-training-steps",
    )
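
# Because api_name is set, the endpoint can also be called programmatically.
# A minimal sketch with gradio_client; the URL and all argument values below
# are assumptions for illustration:
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   samples, steps = client.predict(
#       "wikitext", "train", "wikitext-2-raw-v1", "gpt2",
#       1, 1, 8, 1, 2048, "text", "",
#       api_name="/calculate-training-steps",
#   )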


if __name__ == "__main__":
    # queue() is required for the progress bar to update during token counting.
    demo.queue().launch()