import spaces
import gradio as gr
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, AutoConfig
from datasets import load_dataset, concatenate_datasets, load_from_disk, DatasetDict
import traceback
from sklearn.metrics import accuracy_score
import numpy as np
import torch
import os
import evaluate
from huggingface_hub import login
from peft import get_peft_model, LoraConfig
# NOTE: ideally set before the transformers/datasets imports above, since cache
# paths are resolved at import time
os.environ['HF_HOME'] = '/data/.huggingface'
'''
lora_config = LoraConfig(
r=16, # Rank of the low-rank adaptation
lora_alpha=32, # Scaling factor
lora_dropout=0.1, # Dropout for LoRA layers
bias="none" # Bias handling
)
model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny', num_labels=2, force_download=True)
model = get_peft_model(model, lora_config)
model.gradient_checkpointing_enable()
model_save_path = '/data/lora_finetuned_model' # Specify your desired save path
model.save_pretrained(model_save_path)
'''
def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
try:
        torch.cuda.empty_cache()
        rouge_metric = evaluate.load("rouge", cache_dir='/data/cache')
def compute_metrics(eval_preds):
preds, labels = eval_preds
if isinstance(preds, tuple):
preds = preds[0]
# Replace -100s used for padding as we can't decode them
preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
# Decode predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
# Compute ROUGE metrics
result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
result = {k: round(v * 100, 4) for k, v in result.items()}
            # Exact-match "accuracy": fraction of predictions whose decoded text
            # equals the reference exactly
            accuracy = accuracy_score(decoded_labels, decoded_preds)
            result["accuracy"] = round(accuracy * 100, 4)
# Calculate average generation length
prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
result["gen_len"] = np.mean(prediction_lens)
return result
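        # Hedged illustration of the dict compute_metrics returns (values are
        # hypothetical):
        #   {'rouge1': 41.2, 'rouge2': 19.8, 'rougeL': 38.5, 'rougeLsum': 38.6,
        #    'accuracy': 12.5, 'gen_len': 17.3}
        # As a sanity check, rouge_metric.compute(predictions=['a cat sat'],
        # references=['a cat sat']) gives rouge1 == 1.0 before the *100 scaling.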
        # Authenticate with the Hub so the trained model can be pushed later
        login(api_key.strip())
        # Set training arguments
training_args = TrainingArguments(
remove_unused_columns=False,
torch_empty_cache_steps=100,
overwrite_output_dir=True,
output_dir='/data/results',
            eval_strategy="steps",  # evaluate every eval_steps (defaults to logging_steps)
save_strategy='steps',
learning_rate=lr*0.00001,
per_device_train_batch_size=int(batch_size),
per_device_eval_batch_size=int(batch_size),
num_train_epochs=int(num_epochs),
weight_decay=0.01,
#gradient_accumulation_steps=int(grad),
#max_grad_norm = 3.0,
            load_best_model_at_end=True,
            # compute_metrics is not passed to the Trainer below, so only the
            # eval loss is available for selecting the best checkpoint
            metric_for_best_model="loss",
            greater_is_better=False,
logging_dir='/data/logs',
logging_steps=200,
#push_to_hub=True,
hub_model_id=hub_id.strip(),
            fp16=True,  # NB: fp16 can be numerically unstable with some T5 checkpoints
#lr_scheduler_type='cosine',
            save_steps=200, # Save a checkpoint every 200 steps
save_total_limit=3,
)
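        # Sketch of the slider-to-hyperparameter mapping assumed here: the Gradio
        # sliders below pass integers, so e.g. lr=5 becomes 5 * 0.00001 = 5e-5,
        # matching the "Learning Rate (e-5)" label on the interface.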
# Check if a checkpoint exists and load it
#if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
#print("Loading model from checkpoint...")
#model = AutoModelForSeq2SeqLM.from_pretrained(training_args.output_dir)
        # NOTE: config is loaded but not used here; the model is passed in by run_train
        config = AutoConfig.from_pretrained("google/t5-efficient-tiny-nh8")
        tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8', use_fast=True, trust_remote_code=True)
        #max_length = model.get_input_embeddings().weight.shape[0]  # NB: this is the vocab size, not a sequence length
        max_length = 512
def tokenize_function(examples):
            # 'text' is the input column and 'target' the expected output
            model_inputs = tokenizer(
                examples['text'],
                max_length=max_length,  # pad/truncate to a fixed length so batching is uniform
                truncation=True,
                padding='max_length',
            )
            # Tokenize the targets; the model derives the (right-shifted)
            # decoder input ids from these labels internally
            labels = tokenizer(
                examples['target'],
                max_length=max_length,
                truncation=True,
                padding='max_length',
            )
            # NOTE: pad tokens in the labels are left as pad_token_id, so the loss
            # is also computed over padding; the commented lines below would mask
            # them out with -100 instead.
            #labels["input_ids"] = [
            #    [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
            #]
            # Add labels to the model inputs
            model_inputs["labels"] = labels["input_ids"]
return model_inputs
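        # Hedged usage sketch (ids are hypothetical): tokenize_function returns
        # plain lists suitable for datasets.map, e.g.
        #   tokenize_function({'text': ['summarize: some text'], 'target': ['a summary']})
        #   -> {'input_ids': [[...]], 'attention_mask': [[...]], 'labels': [[...]]}
        # with every sequence padded/truncated to max_length.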
#max_length = 512
# Load the dataset
column_names = ['text', 'target']
#try:
#saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
#if os.access(f'/data/{hub_id.strip()}_test_dataset', os.R_OK):
#train_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset3')
#saved_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_validation_dataset')
#dataset = load_dataset(dataset_name.strip())
#print("FOUND TEST")
## Create Trainer
#data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
#trainer = Trainer(
#model=model,
#args=training_args,
#train_dataset=train_dataset,
#eval_dataset=saved_test_dataset['input_ids'],
#compute_metrics=compute_metrics,
#data_collator=data_collator,
##processing_class=tokenizer,
#)
#elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
#dataset = load_dataset(dataset_name.strip())
##dataset['test'] = dataset['test'].select(range(700))
#dataset['test'] = dataset['test'].select(range(50))
#del dataset['train']
#del dataset['validation']
#test_set = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
#test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
#return 'TRAINING DONE'
#elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
#dataset = load_dataset(dataset_name.strip())
#dataset['train'] = dataset['train'].select(range(8000))
#dataset['train'] = dataset['train'].select(range(1000))
#train_size = len(dataset['train'])
#third_size = train_size // 3
#del dataset['test']
#del dataset['validation']
#print("FOUND VALIDATION")
#saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset2')
#third_third = dataset['train'].select(range(third_size*2, train_size))
#dataset['train'] = third_third
##tokenized_second_half = tokenize_function(third_third)
#tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=50,remove_columns=column_names,)
#dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
#dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
#return 'THIRD THIRD LOADED'
#if os.access(f'/data/{hub_id.strip()}_train_dataset', os.R_OK) and not os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
#dataset = load_dataset(dataset_name.strip())
#dataset['train'] = dataset['train'].select(range(1000))
#dataset['validation'] = dataset['validation'].select(range(100))
##dataset['train'] = dataset['train'].select(range(8000))
##dataset['validation'] = dataset['validation'].select(range(300))
#train_size = len(dataset['train'])
#third_size = train_size // 3
#second_third = dataset['train'].select(range(third_size, third_size*2))
#dataset['train'] = second_third
#del dataset['test']
#tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
#dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
#dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
#dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
#return 'SECOND THIRD LOADED'
#except Exception as e:
#print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
#dataset = load_dataset(dataset_name.strip())
##dataset['train'] = dataset['train'].select(range(8000))
#dataset['train'] = dataset['train'].select(range(1000))
#train_size = len(dataset['train'])
#third_size = train_size // 3
## Tokenize the dataset
#first_third = dataset['train'].select(range(third_size))
#dataset['train'] = first_third
#del dataset['test']
#del dataset['validation']
#tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
#tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
#print('DONE')
#return 'RUN AGAIN TO LOAD REST OF DATA'
dataset = load_dataset(dataset_name.strip())
#dataset['train'] = dataset['train'].select(range(8000))
dataset['train'] = dataset['train'].select(range(4000))
dataset['validation'] = dataset['validation'].select(range(200))
        # Drop the raw text columns so batches contain only tensor fields
        # (remove_unused_columns=False above keeps Trainer from dropping them)
        train_set = dataset.map(tokenize_function, batched=True, remove_columns=column_names)
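        # Assumption: the dataset exposes 'train' and 'validation' splits with
        # 'text' and 'target' columns; datasets without them (e.g. 'imdb', which
        # has 'text'/'label' and no 'validation' split) will fail above.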
#valid_set = dataset['validation'].map(tokenize_function, batched=True)
#print(train_set.keys())
print('DONE')
        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_set['train'],
            eval_dataset=train_set['validation'],
            # compute_metrics is left disabled: the plain Trainer hands logits,
            # not generated token ids, to compute_metrics, so the text metrics
            # above would not decode correctly
            #compute_metrics=compute_metrics,
            data_collator=data_collator,
        )
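        # Hedged alternative (not wired in): to make compute_metrics meaningful,
        # evaluation needs generated ids rather than logits, which the plain
        # Trainer does not produce. A Seq2SeqTrainer sketch, reusing the pieces
        # above where possible:
        #
        # from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
        # gen_args = Seq2SeqTrainingArguments(
        #     output_dir='/data/results',
        #     predict_with_generate=True,  # run generate() during eval
        #     per_device_eval_batch_size=int(batch_size),
        # )
        # trainer = Seq2SeqTrainer(
        #     model=model, args=gen_args,
        #     train_dataset=train_set['train'], eval_dataset=train_set['validation'],
        #     data_collator=data_collator, compute_metrics=compute_metrics,
        # )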
# Fine-tune the model
trainer.train()
#if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
#train_result = trainer.train(resume_from_checkpoint=True)
#else:
#train_result = trainer.train()
trainer.push_to_hub(commit_message="Training complete!")
except Exception as e:
return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
    return 'DONE!'  # train_result
'''
# Define Gradio interface
def predict(text):
    # NOTE: model_name must be defined for this reference snippet to run
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip())
    tokenizer = AutoTokenizer.from_pretrained(model_name.strip())
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
'''
@spaces.GPU(duration=120)
def run_train(dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    def initialize_weights(model):
        # from_config below already yields randomly initialized weights; this
        # just re-initializes two example layers with specific schemes
        for name, param in model.named_parameters():
if 'encoder.block.0.layer.0.DenseReluDense.wi.weight' in name: # Example layer
torch.nn.init.xavier_uniform_(param.data) # Xavier initialization
elif 'encoder.block.0.layer.0.DenseReluDense.wo.weight' in name: # Another example layer
torch.nn.init.kaiming_normal_(param.data) # Kaiming initialization
config = AutoConfig.from_pretrained("google/t5-efficient-tiny-nh8")
model = AutoModelForSeq2SeqLM.from_config(config)
initialize_weights(model)
lora_config = LoraConfig(
r=16, # Rank of the low-rank adaptation
lora_alpha=32, # Scaling factor
lora_dropout=0.1, # Dropout for LoRA layers
bias="none" # Bias handling
)
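    # Hedged note: for T5-style models a working LoRA setup usually also sets
    # target_modules and task_type, e.g. (assumption, not from the original):
    # lora_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.1, bias="none",
    #                          target_modules=["q", "v"],
    #                          task_type=TaskType.SEQ_2_SEQ_LM)  # from peft import TaskType
    # The get_peft_model call is left commented out below, as in the original.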
#model = get_peft_model(model, lora_config)
result = fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad)
return result
# Create Gradio interface
try:
iface = gr.Interface(
fn=run_train,
inputs=[
gr.Textbox(label="Dataset Name (e.g., 'imdb')"),
gr.Textbox(label="HF hub to push to after training"),
gr.Textbox(label="HF API token"),
gr.Slider(minimum=1, maximum=10, value=3, label="Number of Epochs", step=1),
gr.Slider(minimum=1, maximum=2000, value=1, label="Batch Size", step=1),
gr.Slider(minimum=1, maximum=1000, value=1, label="Learning Rate (e-5)", step=1),
gr.Slider(minimum=1, maximum=100, value=1, label="Gradient accumulation", step=1),
],
outputs="text",
title="Fine-Tune Hugging Face Model",
description="This interface allows you to fine-tune a Hugging Face model on a specified dataset."
)
'''
iface = gr.Interface(
fn=predict,
inputs=[
gr.Textbox(label="Query"),
],
outputs="text",
title="Fine-Tune Hugging Face Model",
        description="This interface allows you to test a fine-tuned Hugging Face model."
)
'''
# Launch the interface
iface.launch()
except Exception as e:
print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")