Image-Text-to-Text
PEFT
Safetensors
English
æLtorio committed on
Commit
297cc58
1 Parent(s): e6f9e1a

add docker job

Browse files
Files changed (3) hide show
  1. Dockerfile +8 -0
  2. learn.py +146 -0
  3. start.sh +10 -0
Dockerfile ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
FROM ovhcom/ai-training-pytorch:latest

# RUN uses `/bin/sh -c` by default, where the bash builtin `source` may not
# exist. Switch the build shell to bash (start.sh already requires /bin/bash)
# so the conda activation below works regardless of what /bin/sh points to.
SHELL ["/bin/bash", "-c"]

# Install the training dependencies into the image's miniconda environment.
# bitsandbytes is listed explicitly because learn.py uses BitsAndBytesConfig
# and the "paged_adamw_8bit" optimizer; it is installed here in case the base
# image does not already ship it.
RUN source /workspace/.miniconda3/bin/activate \
 && pip install -U "safetensors>=0.4.5" \
 && pip install -U git+https://github.com/huggingface/transformers.git \
 && pip install huggingface_hub accelerate datasets peft bitsandbytes \
 && pip install -U Pillow

# The entry script only needs to be readable and executable, not world-writable.
COPY --chmod=755 start.sh /start.sh
COPY learn.py /learn.py
learn.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+
4
+ from huggingface_hub import login as hf_login
5
+ from datasets import load_dataset
6
+ from peft import LoraConfig
7
+ from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration, TrainingArguments, Trainer
8
+
9
# Read the Hugging Face token from the environment when it is exported;
# fall back to an empty string otherwise.
HF_TOKEN = os.environ.get('HF_TOKEN')
if HF_TOKEN is None:
    HF_TOKEN = ""
else:
    print("Hugging Face token found in environment variable")

# Log in to the Hub with that token (add_to_git_credential is enabled).
hf_login(token=HF_TOKEN, add_to_git_credential=True)
19
# --- Run configuration -----------------------------------------------------
# Hub identifiers and the local output directory used by the rest of the script.
dataset_id = "eltorio/ROCO-radiology"
source_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"
destination_model_id = "eltorio/ROCO-idefics3-8B"
output_dir = "IDEFICS3_ROCO"

# System prompt placed in front of every training conversation.
prompt = (
    "You are an expert radiologist certified with over 15 years of experience "
    "in diagnostic imaging, describe this image"
)

# Target device and fine-tuning mode switches.
DEVICE = "cuda:0"
USE_LORA = False
USE_QLORA = True

# --- Data and preprocessing ------------------------------------------------
train_dataset = load_dataset(dataset_id, split="train")

# Image splitting is turned off when loading the processor.
processor = AutoProcessor.from_pretrained(source_model_id, do_image_splitting=False)
35
+
36
# Load the base model. Without any LoRA flavour the full fp16 model is loaded
# with flash-attention and moved to DEVICE; otherwise a LoRA adapter is
# attached, on top of a 4-bit quantized base when USE_QLORA is set.
if not (USE_QLORA or USE_LORA):
    model = Idefics3ForConditionalGeneration.from_pretrained(
        source_model_id,
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",  # This works for A100 or H100
    ).to(DEVICE)
else:
    # Adapter definition shared by LoRA and QLoRA; DoRA is only enabled for
    # plain LoRA (i.e. when the base model is not quantized).
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$',
        use_dora=not USE_QLORA,
        init_lora_weights="gaussian",
    )

    # 4-bit NF4 quantization settings are only built for QLoRA.
    quantization_settings = None
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        quantization_settings = bnb_config

    model = Idefics3ForConditionalGeneration.from_pretrained(
        source_model_id,
        torch_dtype=torch.float16,
        quantization_config=quantization_settings,
    )
    model.add_adapter(lora_config)
    model.enable_adapters()
64
+
65
class MyDataCollator:
    """Turn dataset samples into a padded model batch with training labels.

    Each sample must provide an ``"image"`` (an object with a
    ``convert('RGB')`` method) and a ``"caption"`` string. For every sample a
    three-turn conversation is built (system prompt, user image, assistant
    caption), rendered with the processor's chat template and then tokenized
    together with the image.

    Args:
        processor: Object exposing ``tokenizer``, ``apply_chat_template`` and
            a ``__call__(text=..., images=..., return_tensors=..., padding=...)``
            that returns a mapping containing ``"input_ids"``.
        system_prompt: Text of the system turn. When ``None`` (the default)
            the module-level ``prompt`` is used, as before.
    """

    def __init__(self, processor, system_prompt=None):
        self.processor = processor
        self.system_prompt = system_prompt
        # Id of the "<image>" special token, looked up by its position in the
        # tokenizer's list of additional special tokens.
        special_tokens = processor.tokenizer.additional_special_tokens
        self.image_token_id = processor.tokenizer.additional_special_tokens_ids[
            special_tokens.index("<image>")
        ]

    def _build_messages(self, system_text, answer):
        """Return the system/user/assistant conversation for one sample."""
        return [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": system_text}
                ]
            },
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": answer}
                ]
            }
        ]

    def __call__(self, samples):
        """Collate ``samples`` into a batch that also carries ``"labels"``.

        Returns:
            The processor output with an extra ``"labels"`` entry: a copy of
            ``input_ids`` in which padding positions are replaced by the
            image token id.
        """
        # Resolve the prompt lazily so the module-level default keeps working
        # exactly as it did when the collator read the global directly.
        system_text = prompt if self.system_prompt is None else self.system_prompt

        texts = []
        images = []
        for sample in samples:
            messages = self._build_messages(system_text, sample["caption"])
            # Use the processor this collator was built with, not the global.
            text = self.processor.apply_chat_template(
                messages, add_generation_prompt=False
            )
            texts.append(text.strip())
            # One single-image list per sample.
            images.append([sample["image"].convert('RGB')])

        batch = self.processor(
            text=texts, images=images, return_tensors="pt", padding=True
        )

        # NOTE(review): padding is relabelled with the image token id,
        # presumably because the model's loss ignores that id - confirm
        # against the installed transformers version.
        labels = batch["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels

        return batch
110
+
111
# Imported locally so this section brings its own dependency into scope.
from transformers.trainer_utils import get_last_checkpoint

data_collator = MyDataCollator(processor)

# Training configuration.
# The deprecated `evaluation_strategy=None` argument was dropped: None left the
# default ("no") in place anyway, and newer transformers releases reject the
# old keyword. No evaluation dataset is passed to the Trainer below.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=False,
    auto_find_batch_size=True,
    learning_rate=2e-4,
    fp16=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    dataloader_pin_memory=False,
    save_total_limit=3,
    save_strategy="steps",
    eval_steps=100,
    save_steps=10,  # checkpoint each 10 steps
    # Kept for backward compatibility; the Trainer itself does not act on this
    # field, the actual resume happens in trainer.train() below.
    resume_from_checkpoint=True,
    logging_steps=5,
    remove_unused_columns=False,
    push_to_hub=True,
    label_names=["labels"],
    load_best_model_at_end=False,
    report_to="none",
    optim="paged_adamw_8bit",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Resume from the newest checkpoint in output_dir when there is one; otherwise
# start a fresh run. The directory check avoids an error from listing a folder
# that does not exist yet, and passing None avoids the error that
# `resume_from_checkpoint=True` raises when no checkpoint is present.
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")

trainer.train(resume_from_checkpoint=last_checkpoint)
start.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Entry point of the training job.
#
# Usage: /start.sh [HF_TOKEN]
#   HF_TOKEN  Hugging Face access token; when omitted, an already exported
#             HF_TOKEN environment variable is used instead.

cd /workspace || exit 1

# Take the token from $1, falling back to the environment, and fail early with
# a clear message instead of passing an empty token to the login commands.
HF_TOKEN="${1:-${HF_TOKEN:-}}"
if [ -z "$HF_TOKEN" ]; then
    echo "error: no Hugging Face token given (pass it as \$1 or export HF_TOKEN)" >&2
    exit 1
fi
export HF_TOKEN
# Never print the secret itself: job logs are often shared or retained.
echo "HF_TOKEN: provided (value hidden)"

# Activate the conda environment first so that the tools installed into it
# (huggingface-cli, python packages) are on PATH for everything below.
. /workspace/.miniconda3/bin/activate

git config --global credential.helper store
git lfs install
huggingface-cli login --add-to-git-credential --token "$HF_TOKEN"

# Fetch the existing output repository unless it is already present; a failed
# clone is reported but, as before, does not stop the job.
if [ ! -d IDEFICS3_ROCO ]; then
    git clone https://huggingface.co/eltorio/IDEFICS3_ROCO \
        || echo "warning: could not clone IDEFICS3_ROCO; continuing without it" >&2
fi

python /learn.py