Spaces:
Runtime error
Runtime error
from typing import Set, Tuple

import gradio as gr
from gradio.components import Component

from utils import *
def add_quantization_components() -> Set[Component]:
    """Instantiate the quantization-mode and 4-bit quant-type radio groups.

    Returns:
        A set containing the two radio components created here.
    """
    mode_help = "This flag is used to enable 4/8-bit quantization."
    quant_type_help = (
        "This sets the quantization data type in the bnb.nn.Linear4Bit layers."
    )

    # One radio group switches between the 4-bit and 8-bit options; the
    # 4-bit option is preselected.
    mode_radio = gr.Radio(
        ["load_in_4bit", "load_in_8bit"],
        label="Quantization",
        info=mode_help,
        value="load_in_4bit",
        interactive=True,
        elem_id=LOAD_IN_4_BIT_ID,
    )
    quant_type_radio = gr.Radio(
        ["fp4", "nf4"],
        label="bnb_4bit_quant_type",
        info=quant_type_help,
        value="nf4",
        interactive=True,
        elem_id=BNB_4BIT_QUANT_TYPE,
    )
    return {mode_radio, quant_type_radio}
def add_quantization_components1() -> Set[Component]:
    """Instantiate the compute-dtype radio group and double-quant checkbox.

    Returns:
        A set containing the two components created here.
    """
    dtype_help = (
        "This sets the computational type which might be different than the "
        "input type."
    )
    double_quant_help = (
        "This flag is used for nested quantization where the quantization "
        "constants from the first quantization are quantized again."
    )

    compute_dtype_radio = gr.Radio(
        ["torch.float32", "torch.bfloat16", "torch.float16"],
        label="bnb_4bit_compute_dtype",
        info=dtype_help,
        value="torch.bfloat16",
        interactive=True,
        elem_id=BNB_4BIT_COMPUTE_DTYPE,
    )
    double_quant_box = gr.Checkbox(
        label="bnb_4bit_use_double_quant",
        info=double_quant_help,
        value=True,
        interactive=True,
        elem_id=BNB_4BIT_USE_DOUBLE_QUANT,
    )
    return {compute_dtype_radio, double_quant_box}
def add_dataset_components() -> Set[Component]:
    """Instantiate the dataset dropdown and the shuffling-seed slider.

    The dropdown choices are the ``path`` attribute of every entry in
    ``ft_datasets``.

    Returns:
        A set containing the dropdown and the slider.
    """
    dataset_paths = [entry.path for entry in ft_datasets]

    dataset_dropdown = gr.Dropdown(
        dataset_paths,
        label="Select a Dataset",
        info="Choose a dataset to finetune the model in the ChatML format.",
        elem_id=DATASET_SELECTION_ID,
    )
    seed_slider = gr.Slider(
        0,
        256,
        step=1,
        value=42,
        label="Random Seed",
        info="Set a random seed for shuffling the dataset.",
        interactive=True,
        elem_id=DATASET_SHUFFLING_SEED,
    )
    return {dataset_dropdown, seed_slider}
def add_pad_tokens() -> Set[Component]:
    """Instantiate the tokenizer padding-side and pad-token radio groups.

    Returns:
        A set containing the two radio components created here.
    """
    pad_value_help = (
        "A special token used to make arrays of tokens the same size for "
        "batching purpose. Will then be ignored by attention mechanisms or "
        "loss computation."
    )

    side_radio = gr.Radio(
        ["right", "left"],
        label="Tokenizer: padding_side",
        info="The side on which the model should have padding applied.",
        value="right",
        interactive=True,
        elem_id=PAD_SIDE_ID,
    )
    # ``None`` is both one of the offered choices and the initial value.
    value_radio = gr.Radio(
        [None, "eos_token"],
        label="Tokenizer: pad_token",
        info=pad_value_help,
        value=None,
        interactive=True,
        elem_id=PAD_VALUE_ID,
    )
    return {side_radio, value_radio}
def add_lora_components() -> Set[Component]:
    """Instantiate the sliders for the LoRA ``r`` and ``lora_alpha`` values.

    Returns:
        A set containing the two sliders created here.
    """
    rank_slider = gr.Slider(
        1,
        2048,
        step=1,
        value=6,
        label="r",
        info="Lora attention dimension (the 'rank').",
        interactive=True,
        elem_id=LORA_R_ID,
    )
    alpha_slider = gr.Slider(
        1,
        512,
        step=1,
        value=8,
        label="lora_alpha",
        info="The alpha parameter for Lora scaling.",
        interactive=True,
        elem_id=LORA_ALPHA_ID,
    )
    return {rank_slider, alpha_slider}
def add_lora_components1() -> Set[Component]:
    """Instantiate the LoRA dropout slider and bias radio group.

    Returns:
        A set containing the two components created here.
    """
    bias_help = (
        "Bias type for LoRA. If 'all' or 'lora_only', the corresponding "
        "biases will be updated during training."
    )

    dropout_slider = gr.Slider(
        0,
        1,
        step=0.01,
        value=0.05,
        label="lora_dropout",
        info="The dropout probability for Lora layers.",
        interactive=True,
        elem_id=LORA_DROPOUT_ID,
    )
    bias_radio = gr.Radio(
        ['none', 'all', 'lora_only'],
        label="bias",
        info=bias_help,
        value="none",
        interactive=True,
        elem_id=LORA_BIAS_ID,
    )
    return {dropout_slider, bias_radio}
def add_training_args_1() -> Set[Component]:
    """Instantiate the sliders for ``num_train_epochs`` and ``max_steps``.

    Returns:
        A set containing the two sliders created here.
    """
    max_steps_help = (
        "Total number of training steps to perform. If set to a positive "
        "number it overrides 'num_train_epochs'."
    )

    epochs_slider = gr.Slider(
        1,
        100,
        step=1,
        value=3,
        label="num_train_epochs",
        info="Total number of training epochs to perform.",
        interactive=True,
        elem_id=NUM_TRAIN_EPOCHS_ID,
    )
    # The range starts at -1, which is also the initial value.
    max_steps_slider = gr.Slider(
        -1,
        100,
        step=1,
        value=-1,
        label="max_steps",
        info=max_steps_help,
        interactive=True,
        elem_id=MAX_STEPS_ID,
    )
    return {epochs_slider, max_steps_slider}
def add_training_args_1_bis() -> Set[Component]:
    """Instantiate the logging-steps, batch-size and save-strategy widgets.

    Returns:
        A set containing the three components created here.
    """
    logging_help = (
        "Number of update steps between two logs if logging_strategy='steps'"
    )

    logging_slider = gr.Slider(
        1,
        100,
        step=1,
        value=10,
        label="logging_steps",
        info=logging_help,
        interactive=True,
        elem_id=LOGGING_STEPS_ID,
    )
    batch_size_slider = gr.Slider(
        1,
        64,
        step=1,
        value=3,
        label="per_device_train_batch_size",
        info="Batch size per device during training.",
        interactive=True,
        elem_id=PER_DEVICE_TRAIN_BATCH_SIZE,
    )
    save_radio = gr.Radio(
        ['no', 'epoch', 'steps'],
        label="save_strategy",
        info="The checkpoint save strategy to adopt during training.",
        value="epoch",
        interactive=True,
        elem_id=SAVE_STRATEGY_ID,
    )
    return {save_radio, logging_slider, batch_size_slider}
def add_training_args_3() -> Set[Component]:
    """Instantiate the gradient, warmup and LR-scheduler widgets.

    Creates, in order: ``max_grad_norm``, ``warmup_ratio``,
    ``gradient_accumulation_steps``, ``gradient_checkpointing`` and
    ``lr_scheduler_type``.

    Returns:
        A set containing the five components created here.
    """
    warmup_help = (
        "Ratio of total training steps used for a linear warmup from 0 to "
        "learning_rate."
    )
    accumulation_help = (
        "Number of updates steps to accumulate the gradients for, before "
        "performing a backward/update pass."
    )
    checkpointing_help = (
        "Use gradient checkpointing to save memory at the expense of slower "
        "backward pass."
    )

    # The first two sliders are created without an explicit ``step``.
    grad_norm_slider = gr.Slider(
        0.01,
        1,
        value=0.3,
        label="max_grad_norm",
        info="Maximum gradient norm (for gradient clipping).",
        interactive=True,
        elem_id=MAX_GRAD_NORM_ID,
    )
    warmup_slider = gr.Slider(
        0,
        1,
        value=0.1,
        label="warmup_ratio",
        info=warmup_help,
        interactive=True,
        elem_id=WARMUP_RATIO_ID,
    )
    accumulation_slider = gr.Slider(
        1,
        64,
        step=1,
        value=2,
        label="gradient_accumulation_steps",
        info=accumulation_help,
        interactive=True,
        elem_id=GRADIENT_ACCUMULATION_STEPS_ID,
    )
    checkpointing_box = gr.Checkbox(
        label="gradient_checkpointing",
        info=checkpointing_help,
        value=True,
        interactive=True,
        elem_id=GRADIENT_CHECKPOINTING_ID,
    )
    scheduler_radio = gr.Radio(
        ['linear', 'constant', 'cosine'],
        label="lr_scheduler_type",
        info="The learning rate scheduler type to use.",
        value="cosine",
        interactive=True,
        elem_id=LR_SCHEDULER_TYPE_ID,
    )
    return {
        grad_norm_slider,
        warmup_slider,
        accumulation_slider,
        checkpointing_box,
        scheduler_radio,
    }
def add_outputs() -> Tuple[Component, Component]:
    """Instantiate the output-directory textbox and the push-to-hub checkbox.

    Returns:
        A ``(output_dir, push_to_hub)`` pair: the textbox first, then the
        checkbox (initially unchecked).
    """
    output_dir = gr.Textbox(
        label="output_dir",
        info='The output directory where the model and checkpoints will be saved.',
        interactive=True,
        elem_id=OUTPUT_DIR_ID,
    )
    push_to_hub = gr.Checkbox(
        label="Push to Hub",
        info="Select this option if you want to upload the trained model to Hugging Face Hub after training. "
             "Please note, if this option is selected, you must provide a valid 'HF_TOKEN' in the generated notebook.",
        value=False,
        interactive=True,
        elem_id=PUSH_TO_HUB_ID,
    )
    return output_dir, push_to_hub
def add_hf_repo_cmp() -> Component:
    """Instantiate the textbox for the Hugging Face repository name.

    The textbox is created with ``visible=False``.

    Returns:
        The newly created textbox.
    """
    return gr.Textbox(
        label="HF Repo name",
        info="Hugging Face repository to be created.",
        placeholder="username/your_repository",
        visible=False,
        interactive=True,
        elem_id=REPOSITORY_NAME_ID,
    )
def add_outputs1() -> Set[Component]:
    """Instantiate the ``report_to`` dropdown and the README checkbox.

    Returns:
        A set containing the two components created here.
    """
    integrations = [
        "azure_ml", "comet_ml", "mlflow", "tensorboard", "wandb", "all", 'none',
    ]
    report_help = (
        "The list of integrations to report the results and logs to. "
        "Supported platforms are 'azure_ml', 'comet_ml', 'mlflow', "
        "'tensorboard' and 'wandb'. Use 'all' to report to all integrations "
        "installed, 'none' for no integrations."
    )
    readme_help = (
        "Choose whether to automatically generate a model card (README.md) "
        "or not."
    )

    report_dropdown = gr.Dropdown(
        integrations,
        label="report_to",
        info=report_help,
        value="tensorboard",
        elem_id=REPORT_TO_ID,
    )
    readme_box = gr.Checkbox(
        label="Automatically Generate a README.md",
        info=readme_help,
        value=True,
        interactive=True,
        elem_id=README_ID,
    )
    return {report_dropdown, readme_box}
def add_optimizer() -> Set[Component]:
    """Instantiate the sliders for the AdamW beta1, beta2 and epsilon values.

    Returns:
        A set containing the three sliders created here.
    """
    beta1_slider = gr.Slider(
        0.00001,
        1,
        value=0.9,
        label="adam_beta1",
        info="The beta1 hyperparameter for the [`AdamW`] optimizer.",
        interactive=True,
        elem_id=BETA1_ID,
    )
    beta2_slider = gr.Slider(
        0.00001,
        1,
        value=0.999,
        label="adam_beta2",
        info="The beta2 hyperparameter for the [`AdamW`] optimizer.",
        interactive=True,
        elem_id=BETA2_ID,
    )
    # NOTE(review): no explicit step is passed although the default value is
    # 1e-8 on a [1e-9, 1] range -- confirm the slider granularity is adequate.
    epsilon_slider = gr.Slider(
        1e-9,
        1,
        value=1e-8,
        label="adam_epsilon",
        info="The epsilon hyperparameter for the [`AdamW`] optimizer.",
        interactive=True,
        elem_id=EPSILON_ID,
    )
    return {beta1_slider, beta2_slider, epsilon_slider}
def add_optimizer1() -> Set[Component]:
    """Instantiate the optimizer dropdown plus learning-rate/weight-decay sliders.

    Returns:
        A set containing the three components created here.
    """
    optimizer_names = [
        "adamw_hf",
        "adamw_torch",
        "adamw_torch_fused",
        "adamw_apex_fused",
        "adamw_anyprecision",
        "adafactor",
    ]
    # The help text deliberately keeps its trailing space.
    optimizer_help = (
        "The optimizer to use: 'adamw_hf', 'adamw_torch', "
        "'adamw_torch_fused', 'adamw_apex_fused', 'adamw_anyprecision' or "
        "'adafactor'. "
    )
    weight_decay_help = (
        "The weight decay to apply (if not zero) to all layers except all "
        "bias and LayerNorm weights in [`AdamW`] optimizer."
    )

    optimizer_dropdown = gr.Dropdown(
        optimizer_names,
        label="optimizer",
        info=optimizer_help,
        value="adamw_torch_fused",
        elem_id=OPTIMIZER_ID,
    )
    # NOTE(review): step=0.001 is far coarser than the default value 2.0e-05
    # -- confirm the intended slider granularity.
    lr_slider = gr.Slider(
        1e-6,
        1,
        step=0.001,
        value=2.0e-05,
        label="learning_rate",
        info="The initial learning rate for AdamW.",
        interactive=True,
        elem_id=LEARNING_RATE_ID,
    )
    weight_decay_slider = gr.Slider(
        0,
        1,
        value=0,
        label="weight_decay",
        info=weight_decay_help,
        interactive=True,
        elem_id=WEIGHT_DECAY_ID,
    )
    return {optimizer_dropdown, lr_slider, weight_decay_slider}
def add_sft_trainer_args() -> Set[Component]:
    """Instantiate the ``max_seq_length`` slider and the ``packing`` checkbox.

    Returns:
        A set containing the two components created here.
    """
    seq_length_help = (
        "The maximum sequence length to use for the `ConstantLengthDataset` "
        "and for automatically creating the Dataset."
    )
    packing_help = (
        "This argument is used by the `ConstantLengthDataset` to pack the "
        "sequences of the dataset."
    )

    seq_length_slider = gr.Slider(
        512,
        3072,
        value=2048,
        label="max_seq_length",
        info=seq_length_help,
        interactive=True,
        elem_id=MAX_SEQ_LENGTH_ID,
    )
    packing_box = gr.Checkbox(
        label="packing",
        info=packing_help,
        value=True,
        interactive=True,
        elem_id=PACKING_ID,
    )
    return {seq_length_slider, packing_box}