import spaces

# Set the cache location before importing libraries that read HF_HOME at import
# time, so model and dataset downloads land on the Space's persistent /data volume.
import os
os.environ['HF_HOME'] = '/data/.huggingface'

import traceback

import gradio as gr
import numpy as np
from datasets import load_dataset, load_from_disk
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import get_last_checkpoint
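
# The LoRA-wrapped model below is created once at start-up and shared by the
# Gradio handlers defined further down.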
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    # task_type tells PEFT how to wrap an encoder-decoder (seq2seq) language model.
    task_type="SEQ_2_SEQ_LM",
)

# Use the same checkpoint as the tokenizer used for preprocessing below.
model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny-nh8')
model.gradient_checkpointing_enable()
# Make the input embeddings require gradients so gradient checkpointing still
# produces gradients for the LoRA adapters while the base weights stay frozen.
model.enable_input_require_grads()
model = get_peft_model(model, lora_config)


def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    try:
        def preprocess_logits_for_metrics(logits, labels):
            # Reduce the per-token vocabulary logits to predicted token ids per batch,
            # so evaluation does not accumulate the full logit tensor.
            if isinstance(logits, tuple):
                logits = logits[0]
            return logits.argmax(dim=-1)

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            # Ignore padded label positions (-100) when computing token-level accuracy.
            mask = labels != -100
            accuracy = accuracy_score(labels[mask], predictions[mask])
            return {'accuracy': accuracy}

        login(api_key.strip())

        training_args = TrainingArguments(
            output_dir='/data/results',
            eval_strategy="steps",
            save_strategy="steps",
            # Evaluate and save at the same cadence so the best checkpoint can be restored.
            eval_steps=100,
            save_steps=100,
            save_total_limit=3,
            logging_dir='/data/logs',
            logging_steps=10,
            # The learning-rate slider is expressed in units of 1e-5.
            learning_rate=lr * 1e-5,
            per_device_train_batch_size=int(batch_size),
            per_device_eval_batch_size=int(batch_size),
            gradient_accumulation_steps=int(grad),
            num_train_epochs=int(num_epochs),
            weight_decay=0.01,
            max_grad_norm=1.0,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            fp16=True,
            hub_model_id=hub_id.strip(),
        )

        # Resume from the most recent checkpoint on the persistent volume, if one exists.
        last_checkpoint = None
        if os.path.isdir(training_args.output_dir) and os.listdir(training_args.output_dir):
            last_checkpoint = get_last_checkpoint(training_args.output_dir)
            if last_checkpoint is not None:
                print(f"Resuming from checkpoint: {last_checkpoint}")

        max_length = 128
        tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
        # Dynamically pad inputs and labels per batch; labels are padded with -100
        # so padded positions are ignored by the loss and by compute_metrics.
        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        try:
            # Reuse tokenized datasets cached on the persistent volume by a previous run.
            tokenized_train_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
            tokenized_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_test_dataset')

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_train_dataset,
                eval_dataset=tokenized_test_dataset,
                data_collator=data_collator,
                compute_metrics=compute_metrics,
                preprocess_logits_for_metrics=preprocess_logits_for_metrics,
            )
        except Exception:
            # No cached datasets yet: download the dataset and tokenize it from scratch.
            dataset = load_dataset(dataset_name.strip())

            def tokenize_function(examples):
                # Tokenize the inputs; padding is handled per batch by the data collator.
                model_inputs = tokenizer(
                    examples['text'],
                    max_length=max_length,
                    truncation=True,
                )
                # Tokenize the targets as labels via the text_target argument.
                labels = tokenizer(
                    text_target=examples['target'],
                    max_length=max_length,
                    truncation=True,
                )
                model_inputs["labels"] = labels["input_ids"]
                return model_inputs

            tokenized_datasets = dataset.map(
                tokenize_function,
                batched=True,
                # Drop the raw text columns so only tensors reach the data collator.
                remove_columns=dataset['train'].column_names,
            )

            # Cache the tokenized splits on the persistent volume for later runs.
            tokenized_datasets['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
            tokenized_datasets['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_datasets['train'],
                eval_dataset=tokenized_datasets['test'],
                data_collator=data_collator,
                compute_metrics=compute_metrics,
                preprocess_logits_for_metrics=preprocess_logits_for_metrics,
            )

        trainer.train(resume_from_checkpoint=last_checkpoint)
        trainer.push_to_hub(commit_message="Training complete!")
    except Exception as e:
        return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
    return 'DONE!'


'''
# Define Gradio interface for testing a fine-tuned model
def predict(text):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip(), num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1)
    return predictions.item()
'''
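

# spaces.GPU requests a ZeroGPU worker for up to 120 seconds per call; run_train
# simply forwards the UI inputs to fine_tune_model and returns its status string.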
@spaces.GPU(duration=120)
def run_train(dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    result = fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad)
    return result


try:
    # The model is already loaded and LoRA-wrapped at start-up; just build the UI.
    iface = gr.Interface(
        fn=run_train,
        inputs=[
            gr.Textbox(label="Dataset Name (a dataset with 'text' and 'target' columns)"),
            gr.Textbox(label="HF Hub repo to push to after training"),
            gr.Textbox(label="HF API token"),
            gr.Slider(minimum=1, maximum=10, value=3, label="Number of Epochs", step=1),
            gr.Slider(minimum=1, maximum=2000, value=1, label="Batch Size", step=1),
            gr.Slider(minimum=1, maximum=1000, value=1, label="Learning Rate (e-5)", step=1),
            gr.Slider(minimum=1, maximum=100, value=1, label="Gradient Accumulation Steps", step=1),
        ],
        outputs="text",
        title="Fine-Tune Hugging Face Model",
        description="This interface allows you to fine-tune a Hugging Face model on a specified dataset.",
    )

    '''
    iface = gr.Interface(
        fn=predict,
        inputs=[
            gr.Textbox(label="Query"),
        ],
        outputs="text",
        title="Fine-Tune Hugging Face Model",
        description="This interface allows you to test a fine-tuned Hugging Face model.",
    )
    '''

    iface.launch()
except Exception as e:
    print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")