Commit 940ee11 (verified)
Author: qgallouedec (HF Staff)
Parent: cd10f86

Update app.py

Files changed (1): app.py (+19 -9)
app.py CHANGED
@@ -15,6 +15,19 @@ training_args = SFTConfig(
     max_length={},
 )"""
 
+
+class _TrainerStub:
+    """Minimal stand-in for an SFTTrainer instance, exposing only the attributes
+    that `_prepare_dataset` and `_tokenize` read from `self`."""
+
+    _is_vlm = False
+    chat_template = None
+    _tokenize = SFTTrainer._tokenize
+
+    def __init__(self, tokenizer):
+        self._tokenizer = tokenizer
+
+
 def benchmark(model_name, dataset_name):
     print(f"Running benchmark for model: {model_name} on dataset: {dataset_name}...")
 
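Note on this hunk: `SFTTrainer._prepare_dataset` is an instance method, and the old code passed `None` for `self`, which presumably failed once the method started reading attributes from the instance. The new `_TrainerStub` supplies just the attributes the method touches. As a minimal, self-contained illustration of this duck-typing pattern (the `Greeter` names below are hypothetical, not from the app):

```python
# An unbound method can be called with any object that provides the
# attributes the method actually reads from `self`.
class Greeter:
    def greet(self):
        return f"Hello, {self.name}!"

class _GreeterStub:
    # Stand-in exposing only what `greet` touches.
    name = "world"

print(Greeter.greet(_GreeterStub()))  # -> Hello, world!
```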
@@ -23,11 +36,11 @@ def benchmark(model_name, dataset_name):
 
     print("Loading tokenizer...")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-
+
     print("Tokenizing dataset...")
     config = SFTConfig(max_length=None, bf16=False)
     tokenized_dataset = SFTTrainer._prepare_dataset(
-        None, dataset, tokenizer, config, packing=False, formatting_func=None, dataset_name="train"
+        _TrainerStub(tokenizer), dataset, tokenizer, config, packing=False, formatting_func=None, dataset_name="train"
     )
 
     print("Computing the sequence lengths and total tokens")
@@ -46,8 +59,8 @@ def benchmark(model_name, dataset_name):
 
     hist = np.histogram(sequence_lengths, bins=50)
     lengths_distribution = pd.DataFrame({
-        "max_length": (hist[1][:-1] + hist[1][1:])/2,
-        "Percentage (%)": hist[0]/N_SAMPLES*100,
+        "max_length": (hist[1][:-1] + hist[1][1:]) / 2,
+        "Percentage (%)": hist[0] / N_SAMPLES * 100,
     })
 
     truncation_data = pd.DataFrame({
@@ -57,6 +70,7 @@ def benchmark(model_name, dataset_name):
 
     return lengths_distribution, truncation_data, CODE_TEMPLATE.format(recommended)
 
+
 with gr.Blocks() as demo:
     model_input = gr.Textbox(label="Model Name", value="Qwen/Qwen3-0.6B")
     dataset_input = gr.Textbox(label="Dataset Name", value="trl-lib/tldr")
@@ -78,10 +92,6 @@ This tool helps you choose an appropriate `max_length` value for your SFT traini
 - Generates two visualizations:
   - **Sequence Length Distribution:** Shows how long your tokenized sequences are.
   - **Truncation Percentage:** Estimates the percentage of tokens that would be discarded (truncated) for different `max_length` values.
-  - Recommends the smallest `max_length` where truncation affects less than 5% of the tokens.
-
-Use this tool to balance efficiency and memory usage when setting your `max_length` parameter.
 """)
 
-
-demo.launch()
+demo.launch(server_name="0.0.0.0", server_port=7860)
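The help text above describes estimating, for a range of candidate `max_length` values, the percentage of tokens that would be truncated, and the removed bullet mentions recommending the smallest value where truncation stays under 5%. The hunks don't show that computation; a plausible sketch of it (only `sequence_lengths` and `recommended` appear in the diff, everything else here is an assumption) is:

```python
import numpy as np

# Hypothetical per-example token counts; in the app these come from the
# tokenized dataset.
sequence_lengths = np.array([120, 340, 512, 1024, 2048])
total_tokens = sequence_lengths.sum()

# For each candidate max_length, tokens beyond the cutoff are discarded.
candidates = np.linspace(64, sequence_lengths.max(), num=50, dtype=int)
truncated_pct = [
    np.maximum(sequence_lengths - c, 0).sum() / total_tokens * 100
    for c in candidates
]

# Smallest candidate where truncation affects less than 5% of the tokens.
recommended = next(c for c, p in zip(candidates, truncated_pct) if p < 5)
```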
 
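On the launch change: Gradio's `demo.launch()` defaults to port 7860 but binds to localhost, so the new `server_name="0.0.0.0", server_port=7860` arguments presumably make the app reachable from outside a Spaces container, where serving on all interfaces at port 7860 is expected.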