alexghergh committed on
Commit abd6171
1 Parent(s): 45dbcf7

Add end-of-training model, README, tokenizer

README.md CHANGED
@@ -1,3 +1,17 @@
  ---
- license: mit
+ library_name: peft
+ base_model: google/gemma-2b
+ widget:
+ - text: "Salut, ce zi minunata pentru"
  ---
+
+ ## Model details
+
+ A (decent) attempt at fine-tuning a Gemma 2B model on roughly 1.6 GB of
+ high-quality Romanian text.
+
+ All the scripts and data used are available in this repo.
+
+ ### Framework versions
+
+ - PEFT 0.9.0
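
For completeness, a minimal sketch of loading this adapter for inference follows; the adapter id `alexghergh/gemma-2b-romanian-qlora` is an assumed placeholder, so substitute this repo's actual Hub id or a local path to the saved adapter.

```python
# Minimal usage sketch. The adapter id below is an assumed placeholder;
# point it at this repo's actual Hub id or a local adapter directory.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "google/gemma-2b"
adapter_id = "alexghergh/gemma-2b-romanian-qlora"  # assumption, not confirmed by the card

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.float16, device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_id)

inputs = tokenizer("Salut, ce zi minunata pentru", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```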
adapter_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "google/gemma-2b",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_dropout": 0.1,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 8,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "down_proj",
+     "up_proj",
+     "o_proj",
+     "v_proj",
+     "gate_proj",
+     "k_proj",
+     "q_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
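
The config above mirrors the `LoraConfig` built in `fine-tuning.py` below (r=8, alpha=32, dropout 0.1, all attention and MLP projections). A quick sketch for reloading it without touching any weights, assuming the local directory name used as `save_dir` in `fine-tuning.py`:

```python
# Sketch: reload the stored adapter config (no weights involved).
# The directory name is assumed from save_dir in fine-tuning.py.
from peft import PeftConfig

config = PeftConfig.from_pretrained("gemma-2b-romanian-1.6gb-finetuned-qlora")
print(config.base_model_name_or_path)  # "google/gemma-2b"
print(config)                          # full LoRA settings as saved above
```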
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:55ffb0e9dd4622929f14a38e560e535b02ffdd2da430c5bd6597af450619e38a
+ size 39256456
fine-tuning.py ADDED
@@ -0,0 +1,127 @@
+ # start with: torchrun --nproc-per-node <n_gpus> fine-tuning.py
+ import os
+
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     DataCollatorForLanguageModeling,
+     TrainingArguments,
+     Trainer,
+     BitsAndBytesConfig,
+     TrainerCallback,
+ )
+ from datasets import load_from_disk
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+ from peft.tuners.lora import LoraLayer
+ from accelerate import Accelerator
+
+
+ batch_size = 2
+
+ checkpoint = "google/gemma-2b"
+ data_dir = "dataset_ro_small_v1/"
+ save_dir = "gemma-2b-romanian-1.6gb-finetuned-qlora"
+ log_dir = "training_logs/"
+
+ # load the pre-tokenized dataset produced by preprocessing.py
+ tokenized_datasets = load_from_disk(f'tokenized_{data_dir}')
+
+ tokenized_datasets = tokenized_datasets.shuffle(seed=42)
+
+ print(tokenized_datasets)
+
+ # load the base model quantized to 4-bit (QLoRA)
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_quant_dtype=torch.float16,  # note: not a recognized BitsAndBytesConfig argument
+     bnb_4bit_compute_dtype=torch.float16,
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     checkpoint,
+     load_in_8bit=False,
+     quantization_config=bnb_config,
+     device_map={"": Accelerator().process_index},  # see https://github.com/huggingface/trl/issues/1348
+     torch_dtype=torch.float16,
+     trust_remote_code=True,
+     attn_implementation='sdpa',  # or 'flash_attention_2' if available
+     use_cache=False,
+ )
+ model = prepare_model_for_kbit_training(model)
+
+ # LoRA config: r=8, alpha=32, all attention and MLP projections targeted
+ lora_config = LoraConfig(
+     lora_alpha=32,
+     lora_dropout=0.1,
+     r=8,
+     bias="none",
+     task_type="CAUSAL_LM",
+     target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
+ )
+ model = get_peft_model(model, lora_config)
+
+ model.print_trainable_parameters()
+
+ # load tokenizer from checkpoint
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+ tokenizer.pad_token = tokenizer.eos_token
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ # training args
+ args = TrainingArguments(
+     output_dir='training_checkpoints/',
+     logging_dir=log_dir,
+     per_device_train_batch_size=batch_size,
+     per_device_eval_batch_size=batch_size,
+     evaluation_strategy='no',
+     logging_steps=100,
+     save_strategy='steps',
+     save_steps=100,
+     save_total_limit=10,
+     gradient_accumulation_steps=4,
+     gradient_checkpointing=True,
+     gradient_checkpointing_kwargs={"use_reentrant": False},
+     num_train_epochs=1,
+     warmup_steps=1_000,
+     weight_decay=0.001,
+     lr_scheduler_type='cosine',
+     learning_rate=1e-4,
+     max_grad_norm=0.3,
+     fp16=True,
+     ddp_find_unused_parameters=False,
+ )
+
+ # stop the training loop after 1000 updates
+ class StopCallback(TrainerCallback):
+     def on_step_end(self, args, state, control, **kwargs):
+         if state.global_step != 0 and state.global_step % 1000 == 0:
+             # stop training
+             control.should_training_stop = True
+
+ # train as usual
+ trainer = Trainer(
+     model=model,
+     args=args,
+     data_collator=data_collator,
+     train_dataset=tokenized_datasets['train'],
+     eval_dataset=tokenized_datasets['test'],
+     tokenizer=tokenizer,
+ )
+ trainer.add_callback(StopCallback)
+
+ print("Starting training...")
+
+ train_checkpoint = os.getenv("TRAIN_CHECKPOINT")
+ if train_checkpoint is not None:
+     trainer.train(resume_from_checkpoint=train_checkpoint)  # resume training from checkpoint dir
+ else:
+     trainer.train()
+
+ # save trainer state at end
+ torch.save(trainer.state.log_history, "trainer_log_history.pth")
+
+ model.save_pretrained(save_dir)
+ tokenizer.save_pretrained(save_dir)
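
A side note on the stop logic above: `StopCallback` halts the run every 1,000 optimizer steps so training can proceed in chunks via `TRAIN_CHECKPOINT`. If a single hard stop at 1,000 steps is all that is needed, the built-in `max_steps` argument achieves the same thing without a callback; a minimal sketch reusing the script's settings:

```python
# Sketch: stop at a fixed step count with TrainingArguments alone,
# instead of the StopCallback above. Remaining arguments as in the script.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="training_checkpoints/",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=1_000,   # overrides num_train_epochs; training stops here
    save_strategy="steps",
    save_steps=100,
)
```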
inference.py ADDED
@@ -0,0 +1,31 @@
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+ )
+ from peft import PeftModel, PeftConfig
+ import torch
+
+ orig_checkpoint = 'google/gemma-2b'
+ checkpoint = 'checkpoint-4000'
+ HF_TOKEN = ''
+ PROMPT = 'Salut, ca sa imi schimb buletinul pot sa'
+
+ seq_len = 2048
+
+ # load the original base model and tokenizer first
+ tokenizer = AutoTokenizer.from_pretrained(orig_checkpoint, token=HF_TOKEN)
+
+ config = PeftConfig.from_pretrained(checkpoint)
+ model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, token=HF_TOKEN)
+
+ # then merge the trained QLoRA weights into the base model
+ model = PeftModel.from_pretrained(model, checkpoint)
+ model = model.merge_and_unload()
+
+ model = model.cuda()
+
+ # generate normally
+ inputs = tokenizer.encode(PROMPT, return_tensors="pt").cuda()
+ outputs = model.generate(inputs, max_new_tokens=seq_len)
+
+ print(tokenizer.decode(outputs[0]))
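
Because `merge_and_unload()` folds the LoRA deltas back into the base weights, the merged model can also be exported once and served later with plain `transformers`. A sketch, assuming the adapter lives in `checkpoint-4000` as above; `gemma-2b-romanian-merged` is just a made-up output directory name:

```python
# Sketch: export a merged model so later inference needs only transformers.
# "checkpoint-4000" matches inference.py above; the output dir is an assumption.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

config = PeftConfig.from_pretrained("checkpoint-4000")
base = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# fold the LoRA weights into the base model and save a standalone copy
merged = PeftModel.from_pretrained(base, "checkpoint-4000").merge_and_unload()
merged.save_pretrained("gemma-2b-romanian-merged")
tokenizer.save_pretrained("gemma-2b-romanian-merged")
```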
preprocessing.py ADDED
@@ -0,0 +1,38 @@
+ from transformers import AutoTokenizer
+ from datasets import load_dataset
+
+
+ checkpoint = "google/gemma-2b"
+ data_dir = "dataset_ro_small_v1/"
+
+ seq_len = 2048
+
+ raw_datasets = load_dataset("json", data_dir=data_dir, split='train')
+ raw_datasets = raw_datasets.remove_columns(['url', 'date_download', 'digest',
+                                             'length', 'nlines', 'source_domain',
+                                             'title', 'cc_segment',
+                                             'original_nlines',
+                                             'original_length', 'line_ids',
+                                             'language', 'language_score'])
+ raw_datasets = raw_datasets.rename_column('raw_content', 'text')
+ raw_datasets = raw_datasets.train_test_split(test_size=0.1)
+
+ print(raw_datasets)
+
+ # load tokenizer from checkpoint
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+ def tokenize_fn(examples):
+     return tokenizer(examples['text'],
+                      max_length=seq_len,
+                      return_overflowing_tokens=True,  # one long document becomes several rows of <= seq_len tokens
+                      truncation=True)
+
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenized_datasets = raw_datasets.map(
+     tokenize_fn,
+     batched=True,
+     remove_columns=raw_datasets['train'].column_names  # row counts change, so drop the old columns
+ )
+
+ tokenized_datasets.save_to_disk(f'tokenized_{data_dir}')
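
As a quick sanity check after running the script above, the saved dataset can be reloaded and inspected; a sketch, where the path simply follows from `tokenized_{data_dir}` above:

```python
# Sketch: verify the tokenized dataset written by preprocessing.py.
from datasets import load_from_disk

tokenized = load_from_disk("tokenized_dataset_ro_small_v1/")
print(tokenized)  # DatasetDict with 'train' and 'test' splits
print(len(tokenized["train"][0]["input_ids"]))  # each chunk holds at most 2048 tokens
```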
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<eos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<eos>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<eos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<bos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<bos>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<eos>",
+   "legacy": null,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<eos>",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "GemmaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }