gemma-7b-it-python2 / README.md
dvdmrs09's picture
Update README.md
feaff47 verified
---
base_model: google/gemma-2b
pipeline_tag: text-generation
library_name: peft
---
# Model Card for gemma-7b-it-python2
Fine-tuned on Python.
## Model Details
### Model Description
### Model Sources [optional]
Gemma-2b fine-tuned on the python-oasst dataset.
## Uses
## Training Details
### Training Data
```json
{
"_timestamp": 1711018613.433522,
"train/grad_norm": 0.1240904619429259,
"train/global_step": 232,
"eval/steps_per_second": 2.894,
"_step": 232,
"_runtime": 2545.4226660728455,
"eval/loss": 1.189491629600525,
"eval/runtime": 1805.8574,
"train/learning_rate": 0.000014800637958532697,
"eval/samples_per_second": 23.152,
"_wandb.runtime": 2547,
"train/loss": 1.0436,
"train/epoch": 0.01
}
```
### Results
#### Summary
## Technical Specifications [optional]
{
"bf16": {
"desc": null,
"value": true
},
"fp16": {
"desc": null,
"value": false
},
"fsdp": {
"desc": null,
"value": []
},
"seed": {
"desc": null,
"value": 42
},
"tf32": {
"desc": null,
"value": false
},
"debug": {
"desc": null,
"value": []
},
"optim": {
"desc": null,
"value": "adamw_bnb_8bit"
},
"qlora": {
"desc": null,
"value": true
},
"top_k": {
"desc": null,
"value": 50
},
"top_p": {
"desc": null,
"value": 1
},
"_wandb": {
"desc": null,
"value": {
"m": [
{
"1": "train/global_step",
"6": [
3
]
},
{
"1": "train/loss",
"5": 1,
"6": [
1
]
},
{
"1": "train/grad_norm",
"5": 1,
"6": [
1
]
},
{
"1": "train/learning_rate",
"5": 1,
"6": [
1
]
},
{
"1": "train/epoch",
"5": 1,
"6": [
1
]
},
{
"1": "eval/loss",
"5": 1,
"6": [
1
]
},
{
"1": "eval/runtime",
"5": 1,
"6": [
1
]
},
{
"1": "eval/samples_per_second",
"5": 1,
"6": [
1
]
},
{
"1": "eval/steps_per_second",
"5": 1,
"6": [
1
]
}
],
"t": {
"1": [
1,
5,
11,
49,
51,
53,
55,
71,
84,
98,
99,
100,
105
],
"2": [
1,
5,
11,
49,
51,
53,
55,
71,
84,
98,
99,
100,
105
],
"3": [
3,
7,
23
],
"4": "3.10.13",
"5": "0.16.4",
"6": "4.39.0.dev0",
"8": [
5
],
"9": {
"1": "transformers_trainer"
},
"13": "linux-x86_64"
},
"framework": "huggingface",
"start_time": 1711016068,
"cli_version": "0.16.4",
"is_jupyter_run": false,
"python_version": "3.10.13",
"is_kaggle_kernel": false,
"huggingface_version": "4.39.0.dev0"
}
},
"prefix": {
"desc": null,
"value": null
},
"do_eval": {
"desc": null,
"value": true
},
"no_cuda": {
"desc": null,
"value": false
},
"use_cpu": {
"desc": null,
"value": false
},
"do_train": {
"desc": null,
"value": false
},
"head_dim": {
"desc": null,
"value": 256
},
"id2label": {
"desc": null,
"value": {
"0": "LABEL_0",
"1": "LABEL_1"
}
},
"label2id": {
"desc": null,
"value": {
"LABEL_0": 0,
"LABEL_1": 1
}
},
"run_name": {
"desc": null,
"value": "./out"
},
"use_ipex": {
"desc": null,
"value": false
},
"adafactor": {
"desc": null,
"value": false
},
"data_seed": {
"desc": null,
"value": null
},
"deepspeed": {
"desc": null,
"value": "deepspeed_configs/zero1.json"
},
"do_sample": {
"desc": null,
"value": false
},
"hub_token": {
"desc": null,
},
"log_level": {
"desc": null,
"value": "passive"
},
"max_steps": {
"desc": null,
"value": -1
},
"num_beams": {
"desc": null,
"value": 1
},
"ray_scope": {
"desc": null,
"value": "last"
},
"report_to": {
"desc": null,
"value": [
"wandb"
]
},
"typical_p": {
"desc": null,
"value": 1
},
"use_cache": {
"desc": null,
"value": false
},
"adam_beta1": {
"desc": null,
"value": 0.9
},
"adam_beta2": {
"desc": null,
"value": 0.999
},
"do_predict": {
"desc": null,
"value": false
},
"eval_delay": {
"desc": null,
"value": 0
},
"eval_steps": {
"desc": null,
"value": 0.03125
},
"hidden_act": {
"desc": null,
"value": "gelu"
},
"is_decoder": {
"desc": null,
"value": false
},
"local_rank": {
"desc": null,
"value": 0
},
"max_length": {
"desc": null,
"value": 20
},
"min_length": {
"desc": null,
"value": 0
},
"model_type": {
"desc": null,
"value": "gemma"
},
"optim_args": {
"desc": null,
"value": null
},
"orpo_alpha": {
"desc": null,
"value": null
},
"output_dir": {
"desc": null,
"value": "./out"
},
"past_index": {
"desc": null,
"value": -1
},
"rope_theta": {
"desc": null,
"value": 10000
},
"save_steps": {
"desc": null,
"value": 0.125
},
"vocab_size": {
"desc": null,
"value": 256000
},
"bench_split": {
"desc": null,
"value": "eval"
},
"ddp_backend": {
"desc": null,
"value": null
},
"ddp_timeout": {
"desc": null,
"value": 1800
},
"fsdp_config": {
"desc": null,
"value": {
"xla": false,
"xla_fsdp_v2": false,
"min_num_params": 0,
"xla_fsdp_grad_ckpt": false
}
},
"hidden_size": {
"desc": null,
"value": 2048
},
"label_names": {
"desc": null,
"value": null
},
"logging_dir": {
"desc": null,
"value": "./out/runs/Mar21_10-14-24_8205afe3ecd2"
},
"pretraining": {
"desc": null,
"value": false
},
"push_to_hub": {
"desc": null,
"value": false
},
"return_dict": {
"desc": null,
"value": true
},
"temperature": {
"desc": null,
"value": 1
},
"torch_dtype": {
"desc": null,
"value": "bfloat16"
},
"torchdynamo": {
"desc": null,
"value": null
},
"torchscript": {
"desc": null,
"value": false
},
"adam_epsilon": {
"desc": null,
"value": 1e-8
},
"bos_token_id": {
"desc": null,
"value": 2
},
"disable_tqdm": {
"desc": null,
"value": false
},
"eos_token_id": {
"desc": null,
"value": 1
},
"fp16_backend": {
"desc": null,
"value": "auto"
},
"hub_model_id": {
"desc": null,
"value": null
},
"hub_strategy": {
"desc": null,
"value": "every_save"
},
"pad_token_id": {
"desc": null,
"value": 0
},
"problem_type": {
"desc": null,
"value": null
},
"pruned_heads": {
"desc": null,
"value": {}
},
"relora_steps": {
"desc": null,
"value": null
},
"rms_norm_eps": {
"desc": null,
"value": 0.000001
},
"rope_scaling": {
"desc": null,
"value": null
},
"sep_token_id": {
"desc": null,
"value": null
},
"use_bfloat16": {
"desc": null,
"value": false
},
"warmup_ratio": {
"desc": null,
"value": 0
},
"warmup_steps": {
"desc": null,
"value": 3135
},
"weight_decay": {
"desc": null,
"value": 0
},
"_name_or_path": {
"desc": null,
"value": "dvdmrs09/gemma2b-train"
},
"architectures": {
"desc": null,
"value": [
"GemmaForCausalLM"
]
},
"bad_words_ids": {
"desc": null,
"value": null
},
"bench_dataset": {
"desc": null,
"value": "pharaouk/dharma-1/dharma_1_mini.json"
},
"do_bench_eval": {
"desc": null,
"value": false
},
"jit_mode_eval": {
"desc": null,
"value": false
},
"learning_rate": {
"desc": null,
"value": 0.0002
},
"logging_steps": {
"desc": null,
"value": 1
},
"max_grad_norm": {
"desc": null,
"value": 1
},
"mp_parameters": {
"desc": null,
"value": ""
},
"output_scores": {
"desc": null,
"value": false
},
"save_strategy": {
"desc": null,
"value": "steps"
},
"split_batches": {
"desc": null,
"value": null
},
"torch_compile": {
"desc": null,
"value": false
},
"tpu_num_cores": {
"desc": null,
"value": null
},
"attention_bias": {
"desc": null,
"value": false
},
"bf16_full_eval": {
"desc": null,
"value": false
},
"early_stopping": {
"desc": null,
"value": false
},
"fp16_full_eval": {
"desc": null,
"value": false
},
"fp16_opt_level": {
"desc": null,
"value": "O1"
},
"length_penalty": {
"desc": null,
"value": 1
},
"max_seq_length": {
"desc": null,
"value": 4096
},
"sample_packing": {
"desc": null,
"value": false
},
"tf_legacy_loss": {
"desc": null,
"value": false
},
"use_mps_device": {
"desc": null,
"value": false
},
"finetuning_task": {
"desc": null,
"value": null
},
"group_by_length": {
"desc": null,
"value": false
},
"hub_always_push": {
"desc": null,
"value": false
},
"num_beam_groups": {
"desc": null,
"value": 1
},
"save_only_model": {
"desc": null,
"value": false
},
"suppress_tokens": {
"desc": null,
"value": null
},
"tokenizer_class": {
"desc": null,
"value": null
},
"dispatch_batches": {
"desc": null,
"value": null
},
"full_determinism": {
"desc": null,
"value": false
},
"hub_private_repo": {
"desc": null,
"value": false
},
"ignore_data_skip": {
"desc": null,
"value": false
},
"log_on_each_node": {
"desc": null,
"value": true
},
"logging_strategy": {
"desc": null,
"value": "steps"
},
"num_train_epochs": {
"desc": null,
"value": 8
},
"save_safetensors": {
"desc": null,
"value": true
},
"save_total_limit": {
"desc": null,
"value": 4
},
"attention_dropout": {
"desc": null,
"value": 0
},
"ddp_bucket_cap_mb": {
"desc": null,
"value": null
},
"diversity_penalty": {
"desc": null,
"value": 0
},
"do_causal_lm_eval": {
"desc": null,
"value": false
},
"greater_is_better": {
"desc": null,
"value": false
},
"initializer_range": {
"desc": null,
"value": 0.02
},
"intermediate_size": {
"desc": null,
"value": 16384
},
"log_level_replica": {
"desc": null,
"value": "warning"
},
"loraplus_lr_ratio": {
"desc": null,
"value": null
},
"lr_scheduler_type": {
"desc": null,
"value": "cosine"
},
"max_bench_samples": {
"desc": null,
"value": null
},
"num_hidden_layers": {
"desc": null,
"value": 18
},
"output_attentions": {
"desc": null,
"value": false
},
"push_to_hub_token": {
"desc": null,
"value": "<PUSH_TO_HUB_TOKEN>"
},
"save_on_each_node": {
"desc": null,
"value": false
},
"tpu_metrics_debug": {
"desc": null,
"value": false
},
"accelerator_config": {
"desc": null,
"value": {
"even_batches": true,
"split_batches": false,
"dispatch_batches": null,
"use_seedable_sampler": true
}
},
"is_encoder_decoder": {
"desc": null,
"value": false
},
"length_column_name": {
"desc": null,
"value": "length"
},
"logging_first_step": {
"desc": null,
"value": false
},
"relora_prune_ratio": {
"desc": null,
"value": 0.9
},
"repetition_penalty": {
"desc": null,
"value": 1
},
"torch_compile_mode": {
"desc": null,
"value": null
},
"add_cross_attention": {
"desc": null,
"value": false
},
"cosine_min_lr_ratio": {
"desc": null,
"value": null
},
"eval_sample_packing": {
"desc": null,
"value": false
},
"evaluation_strategy": {
"desc": null,
"value": "steps"
},
"forced_bos_token_id": {
"desc": null,
"value": null
},
"forced_eos_token_id": {
"desc": null,
"value": null
},
"fsdp_min_num_params": {
"desc": null,
"value": 0
},
"lr_quadratic_warmup": {
"desc": null,
"value": false
},
"lr_scheduler_kwargs": {
"desc": null,
"value": {}
},
"neftune_noise_alpha": {
"desc": null,
"value": null
},
"num_attention_heads": {
"desc": null,
"value": 8
},
"num_key_value_heads": {
"desc": null,
"value": 1
},
"quantization_config": {
"desc": null,
"value": {
"load_in_4bit": true,
"load_in_8bit": false,
"quant_method": "QuantizationMethod.BITS_AND_BYTES",
"_load_in_4bit": true,
"_load_in_8bit": false,
"llm_int8_threshold": 6,
"bnb_4bit_quant_type": "nf4",
"llm_int8_skip_modules": null,
"bnb_4bit_compute_dtype": "bfloat16",
"bnb_4bit_quant_storage": "uint8",
"llm_int8_has_fp16_weight": false,
"bnb_4bit_use_double_quant": true,
"llm_int8_enable_fp32_cpu_offload": false
}
},
"relora_anneal_steps": {
"desc": null,
"value": null
},
"relora_warmup_steps": {
"desc": null,
"value": null
},
"skip_memory_metrics": {
"desc": null,
"value": true
},
"tie_encoder_decoder": {
"desc": null,
"value": false
},
"tie_word_embeddings": {
"desc": null,
"value": true
},
"auto_find_batch_size": {
"desc": null,
"value": false
},
"bench_source_max_len": {
"desc": null,
"value": 2048
},
"dataloader_drop_last": {
"desc": null,
"value": false
},
"no_repeat_ngram_size": {
"desc": null,
"value": 0
},
"num_return_sequences": {
"desc": null,
"value": 1
},
"optim_target_modules": {
"desc": null,
"value": null
},
"output_hidden_states": {
"desc": null,
"value": false
},
"overwrite_output_dir": {
"desc": null,
"value": false
},
"prediction_loss_only": {
"desc": null,
"value": false
},
"push_to_hub_model_id": {
"desc": null,
"value": null
},
"task_specific_params": {
"desc": null,
"value": null
},
"transformers_version": {
"desc": null,
"value": "4.39.0.dev0"
},
"begin_suppress_tokens": {
"desc": null,
"value": null
},
"dataloader_pin_memory": {
"desc": null,
"value": true
},
"ddp_broadcast_buffers": {
"desc": null,
"value": null
},
"loraplus_lr_embedding": {
"desc": null,
"value": null
},
"metric_for_best_model": {
"desc": null,
"value": "loss"
},
"remove_invalid_values": {
"desc": null,
"value": false
},
"remove_unused_columns": {
"desc": null,
"value": true
},
"torch_compile_backend": {
"desc": null,
"value": null
},
"dataloader_num_workers": {
"desc": null,
"value": 0
},
"decoder_start_token_id": {
"desc": null,
"value": null
},
"gradient_checkpointing": {
"desc": null,
"value": true
},
"half_precision_backend": {
"desc": null,
"value": "auto"
},
"label_smoothing_factor": {
"desc": null,
"value": 0
},
"load_best_model_at_end": {
"desc": null,
"value": true
},
"logging_nan_inf_filter": {
"desc": null,
"value": true
},
"multipack_real_batches": {
"desc": null,
"value": false
},
"resume_from_checkpoint": {
"desc": null,
"value": null
},
"chunk_size_feed_forward": {
"desc": null,
"value": 0
},
"eval_accumulation_steps": {
"desc": null,
"value": 3
},
"max_position_embeddings": {
"desc": null,
"value": 8192
},
"per_gpu_eval_batch_size": {
"desc": null,
"value": null
},
"return_dict_in_generate": {
"desc": null,
"value": false
},
"cosine_constant_lr_ratio": {
"desc": null,
"value": null
},
"per_gpu_train_batch_size": {
"desc": null,
"value": null
},
"push_to_hub_organization": {
"desc": null,
"value": null
},
"include_tokens_per_second": {
"desc": null,
"value": false
},
"sample_packing_efficiency": {
"desc": null,
"value": 1
},
"dataloader_prefetch_factor": {
"desc": null,
"value": null
},
"ddp_find_unused_parameters": {
"desc": null,
"value": false
},
"include_inputs_for_metrics": {
"desc": null,
"value": false
},
"per_device_eval_batch_size": {
"desc": null,
"value": 2
},
"use_legacy_prediction_loop": {
"desc": null,
"value": false
},
"cross_attention_hidden_size": {
"desc": null,
"value": null
},
"gradient_accumulation_steps": {
"desc": null,
"value": 3
},
"per_device_train_batch_size": {
"desc": null,
"value": 2
},
"encoder_no_repeat_ngram_size": {
"desc": null,
"value": 0
},
"dataloader_persistent_workers": {
"desc": null,
"value": false
},
"gradient_checkpointing_kwargs": {
"desc": null,
"value": {
"use_reentrant": true
}
},
"include_num_input_tokens_seen": {
"desc": null,
"value": false
},
"exponential_decay_length_penalty": {
"desc": null,
"value": null
},
"sample_packing_seq_len_multiplier": {
"desc": null,
"value": 2
},
"fsdp_transformer_layer_cls_to_wrap": {
"desc": null,
"value": null
}
}
### Model Architecture and Objective
## Citation [optional]
## Glossary [optional]
### Framework versions
- PEFT 0.9.0