Kevin Fink committed on
Commit ab2f056 · 1 Parent(s): 36b5e88
Files changed (1)
  1. app.py +190 -10
app.py CHANGED
@@ -1,21 +1,190 @@
 
 import gradio as gr
-from transformers import AutoModelForSeq2SeqLM
 from transformers import DataCollatorForSeq2Seq, AutoConfig
 
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-print(f"Successfully loaded the model without gradio or spaces, model object: {model}")
 
-@spaces.GPU(duration=120)
-def run_train(model_name, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
 
-    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-    return "WORKS"
 # Create Gradio interface
 try:
     iface = gr.Interface(
         fn=run_train,
         inputs=[
-            gr.Textbox(label="Model Name (e.g., 'google/t5-efficient-tiny-nh8')"),
             gr.Textbox(label="Dataset Name (e.g., 'imdb')"),
             gr.Textbox(label="HF hub to push to after training"),
             gr.Textbox(label="HF API token"),
@@ -28,8 +197,19 @@ try:
         title="Fine-Tune Hugging Face Model",
         description="This interface allows you to fine-tune a Hugging Face model on a specified dataset."
     )
-
     # Launch the interface
     iface.launch()
 except Exception as e:
-    print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
 
 
+import spaces
 import gradio as gr
+from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
 from transformers import DataCollatorForSeq2Seq, AutoConfig
+from datasets import load_dataset, concatenate_datasets, load_from_disk
+import traceback
+from sklearn.metrics import accuracy_score
+import numpy as np
+import torch
+import os
+import evaluate
+from huggingface_hub import login
+from peft import get_peft_model, LoraConfig
 
+os.environ['HF_HOME'] = '/data/.huggingface'
+'''
+lora_config = LoraConfig(
+    r=16,              # Rank of the low-rank adaptation
+    lora_alpha=32,     # Scaling factor
+    lora_dropout=0.1,  # Dropout for LoRA layers
+    bias="none"        # Bias handling
+)
+model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny', num_labels=2, force_download=True)
+model = get_peft_model(model, lora_config)
+model.gradient_checkpointing_enable()
+model_save_path = '/data/lora_finetuned_model'  # Specify your desired save path
+model.save_pretrained(model_save_path)
+'''
 
+def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
+    try:
+        torch.nn.CrossEntropyLoss()
+        metric = evaluate.load("rouge", cache_dir='/data/cache')
+        def compute_metrics(eval_preds):
+            preds, labels = eval_preds
+            if isinstance(preds, tuple):
+                preds = preds[0]
+            # Replace -100s used for padding as we can't decode them
+            preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
+            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+            result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+            result = {k: round(v * 100, 4) for k, v in result.items()}
+            prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
+            result["gen_len"] = np.mean(prediction_lens)
+            return result
+
+        login(api_key.strip())
+
+        # Load the model and tokenizer
+
+        # Set training arguments
+        training_args = TrainingArguments(
+            output_dir='/data/results',
+            eval_strategy="steps",  # Change this to steps
+            save_strategy='steps',
+            learning_rate=lr*0.00001,
+            per_device_train_batch_size=int(batch_size),
+            per_device_eval_batch_size=int(batch_size),
+            num_train_epochs=int(num_epochs),
+            weight_decay=0.01,
+            #gradient_accumulation_steps=int(grad),
+            #max_grad_norm = 1.0,
+            load_best_model_at_end=True,
+            metric_for_best_model="accuracy",
+            greater_is_better=True,
+            logging_dir='/data/logs',
+            logging_steps=10,
+            #push_to_hub=True,
+            hub_model_id=hub_id.strip(),
+            fp16=True,
+            #lr_scheduler_type='cosine',
+            save_steps=100,  # Save a checkpoint every 100 steps
+            save_total_limit=3,
+        )
+        # Check if a checkpoint exists and load it
+        if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
+            print("Loading model from checkpoint...")
+            model = AutoModelForSeq2SeqLM.from_pretrained(training_args.output_dir)
+
+        def tokenize_function(examples):
+            # Assuming 'text' is the input and 'target' is the expected output
+            model_inputs = tokenizer(
+                examples['text'],
+                max_length=max_length,  # Set to None for dynamic padding
+                truncation=True,
+                padding='longest',
+                return_tensors='pt',
+            )
+
+            # Set up the decoder input IDs (shifted right)
+            labels = tokenizer(
+                examples['target'],
+                max_length=max_length,  # Set to None for dynamic padding
+                truncation=True,
+                padding='longest',
+                #text_target=examples['target'],
+                return_tensors='pt',
+            )
+
+            # Add labels to the model inputs
+            model_inputs["labels"] = labels["input_ids"]
+            return model_inputs
+
+        #max_length = 512
+        # Load the dataset
+        dataset = load_dataset(dataset_name.strip())
+        train_size = len(dataset['train'])
+        half_size = train_size // 2
+        max_length = model.get_input_embeddings().weight.shape[0]
+        try:
+            tokenized_first_half = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
+            second_half = dataset['train'].select(range(half_size, train_size))
+            tokenized_second_half = tokenize_function(second_half.to_dict())
+            tokenized_train_dataset = concatenate_datasets([tokenized_first_half, tokenized_second_half])
+            tokenized_test_dataset = tokenize_function(dataset['test'])
+
+            # Create Trainer
+            trainer = Trainer(
+                model=model,
+                args=training_args,
+                train_dataset=tokenized_train_dataset,
+                eval_dataset=tokenized_test_dataset,
+                compute_metrics=compute_metrics,
+            )
+        except:
+            tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
+            # Tokenize the dataset
+            first_half = dataset['train'].select(range(half_size))
+            tokenized_half = tokenize_function(first_half.to_dict())
+
+            tokenized_half.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
+
+            return 'RUN AGAIN TO LOAD REST OF DATA'
+
+        # Fine-tune the model
+        if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
+            train_result = trainer.train(resume_from_checkpoint=True)
+        else:
+            train_result = trainer.train()
+        trainer.push_to_hub(commit_message="Training complete!")
+    except Exception as e:
+        return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
+    return 'DONE!'  # train_result
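
# Illustrative sketch (not part of this commit): fine_tune_model() above tokenizes
# the train split in two runs to stay within the Space's GPU time budget -- the first
# run tokenizes half the data, caches it with save_to_disk(), and returns
# 'RUN AGAIN TO LOAD REST OF DATA'; the second run reloads the cache with
# load_from_disk() and tokenizes the remainder. A minimal, self-contained sketch of
# that pattern using Dataset.map() (the commit instead calls the tokenizer on a plain
# dict); the model name, cache path, and column names here are assumptions.
from datasets import load_dataset, load_from_disk, concatenate_datasets
from transformers import AutoTokenizer

def tokenize_in_two_passes(dataset_name="imdb", cache_path="/tmp/first_half_cache"):
    tokenizer = AutoTokenizer.from_pretrained("google/t5-efficient-tiny-nh8")
    train = load_dataset(dataset_name, split="train")
    half = len(train) // 2

    def tok(batch):
        return tokenizer(batch["text"], truncation=True, max_length=512)

    try:
        first_half = load_from_disk(cache_path)           # second run: reuse the cache
    except FileNotFoundError:
        first_half = train.select(range(half)).map(tok, batched=True)
        first_half.save_to_disk(cache_path)               # first run: build the cache
        return None                                       # caller re-runs to finish
    second_half = train.select(range(half, len(train))).map(tok, batched=True)
    return concatenate_datasets([first_half, second_half])
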
+'''
+# Define Gradio interface
+def predict(text):
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip(), num_labels=2)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+    outputs = model(inputs)
+    predictions = outputs.logits.argmax(dim=-1)
+    return predictions.item()
+'''
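
# Illustrative sketch (not part of this commit): the commented-out predict() above
# calls model(inputs) and argmaxes the logits, which does not produce text for a
# seq2seq model and fails without decoder inputs or labels. A minimal text-to-text
# inference helper for comparison; the checkpoint name below is an assumption.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def predict_text(text, model_name="google/t5-efficient-tiny-nh8"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    output_ids = model.generate(**inputs, max_new_tokens=64)  # greedy decoding by default
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
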
 
+@spaces.GPU(duration=120)
+def run_train(dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
+    def initialize_weights(model):
+        for name, param in model.named_parameters():
+            if 'encoder.block.0.layer.0.DenseReluDense.wi.weight' in name:  # Example layer
+                torch.nn.init.xavier_uniform_(param.data)  # Xavier initialization
+            elif 'encoder.block.0.layer.0.DenseReluDense.wo.weight' in name:  # Another example layer
+                torch.nn.init.kaiming_normal_(param.data)  # Kaiming initialization
+
+    config = AutoConfig.from_pretrained("google/t5-efficient-tiny")
+    model = AutoModelForSeq2SeqLM.from_config(config)
+    initialize_weights(model)
+    lora_config = LoraConfig(
+        r=16,              # Rank of the low-rank adaptation
+        lora_alpha=32,     # Scaling factor
+        lora_dropout=0.1,  # Dropout for LoRA layers
+        bias="none"        # Bias handling
+    )
+    model = get_peft_model(model, lora_config)
+    result = fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad)
+    return result
 # Create Gradio interface
 try:
     iface = gr.Interface(
         fn=run_train,
         inputs=[
             gr.Textbox(label="Dataset Name (e.g., 'imdb')"),
             gr.Textbox(label="HF hub to push to after training"),
             gr.Textbox(label="HF API token"),

         title="Fine-Tune Hugging Face Model",
         description="This interface allows you to fine-tune a Hugging Face model on a specified dataset."
     )
+    '''
+    iface = gr.Interface(
+        fn=predict,
+        inputs=[
+            gr.Textbox(label="Query"),
+        ],
+        outputs="text",
+        title="Fine-Tune Hugging Face Model",
+        description="This interface allows you to test a fine-tuned Hugging Face model."
+    )
+    '''
     # Launch the interface
     iface.launch()
 except Exception as e:
+    print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
+
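
# Illustrative sketch (not part of this commit): app.py imports DataCollatorForSeq2Seq
# but never passes a collator to the Trainer, so padding happens once at tokenization
# time via padding='longest'. With a collator, each batch is padded dynamically
# instead. A minimal sketch of the wiring; the tokenizer, model, and dataset
# arguments are assumed to be the objects built in fine_tune_model() above.
from transformers import DataCollatorForSeq2Seq, Trainer

def build_trainer(model, tokenizer, training_args, train_ds, eval_ds, compute_metrics):
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,               # lets the collator prepare decoder_input_ids from labels
        label_pad_token_id=-100,   # padded label positions are ignored by the loss
    )
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )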