import os
# Set the cache location before importing transformers so the hub cache
# actually lands on the persistent /data volume.
os.environ['HF_HOME'] = '/data/.huggingface'

import spaces
import gradio as gr
import traceback
import numpy as np
import torch
import evaluate
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, AutoConfig
from datasets import load_dataset, concatenate_datasets, load_from_disk, DatasetDict
from sklearn.metrics import accuracy_score
from huggingface_hub import login
from peft import get_peft_model, LoraConfig
'''
# One-off: wrap the base model with LoRA adapters and save them to /data.
lora_config = LoraConfig(
    r=16,             # Rank of the low-rank adaptation
    lora_alpha=32,    # Scaling factor
    lora_dropout=0.1, # Dropout for LoRA layers
    bias="none"       # Bias handling
)
model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny', force_download=True)
model = get_peft_model(model, lora_config)
model.gradient_checkpointing_enable()
model_save_path = '/data/lora_finetuned_model'  # Adapter save path
model.save_pretrained(model_save_path)
'''
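# A minimal sketch (disabled, mirroring the block above) of how the saved LoRA
# adapter could be loaded back onto the base model later; `model_save_path` is
# the path used in the commented-out block above.
# from peft import PeftModel
# base_model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny')
# model = PeftModel.from_pretrained(base_model, model_save_path)
# model = model.merge_and_unload()  # optionally fold the adapter into the base weights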
def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    try:
        torch.cuda.empty_cache()
        rouge_metric = evaluate.load("rouge", cache_dir='/data/cache')

        def compute_metrics(eval_preds):
            preds, labels = eval_preds
            if isinstance(preds, tuple):
                preds = preds[0]
            # Replace the -100s used for padding, since they can't be decoded
            preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            # Decode predictions and labels
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            # Compute ROUGE metrics
            result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            result = {k: round(v * 100, 4) for k, v in result.items()}
            # Exact-match accuracy over the decoded strings
            accuracy = accuracy_score(decoded_labels, decoded_preds)
            result["accuracy"] = round(accuracy * 100, 4)
            # Average generation length (non-pad tokens per prediction)
            prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
            result["gen_len"] = np.mean(prediction_lens)
            return result
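        # Hedged sketch: with the plain `Trainer` (not `Seq2SeqTrainer`),
        # `eval_preds.predictions` are logits rather than generated token IDs, so
        # `compute_metrics` above would receive a (batch, seq_len, vocab) array.
        # One way to make the two compatible, if compute_metrics is re-enabled, is
        # to reduce the logits to token IDs first via the Trainer's supported
        # `preprocess_logits_for_metrics` hook:
        def preprocess_logits_for_metrics(logits, labels):
            if isinstance(logits, tuple):
                logits = logits[0]
            return logits.argmax(dim=-1)
        # It would then be passed alongside compute_metrics, e.g.
        # Trainer(..., compute_metrics=compute_metrics,
        #         preprocess_logits_for_metrics=preprocess_logits_for_metrics)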
        login(api_key.strip())

        # Set training arguments
        training_args = TrainingArguments(
            remove_unused_columns=False,
            torch_empty_cache_steps=100,
            overwrite_output_dir=True,
            output_dir='/data/results',
            eval_strategy="steps",
            save_strategy='steps',
            learning_rate=lr * 1e-5,  # Slider value is in units of 1e-5
            per_device_train_batch_size=int(batch_size),
            per_device_eval_batch_size=int(batch_size),
            num_train_epochs=int(num_epochs),
            weight_decay=0.01,
            #gradient_accumulation_steps=int(grad),
            #max_grad_norm=3.0,
            load_best_model_at_end=True,
            # compute_metrics is currently disabled on the Trainer below, so select
            # the best checkpoint by eval loss rather than a custom "accuracy" key.
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            logging_dir='/data/logs',
            logging_steps=200,
            #push_to_hub=True,
            hub_model_id=hub_id.strip(),
            fp16=True,
            #lr_scheduler_type='cosine',
            save_steps=200,  # Save a checkpoint every 200 steps
            save_total_limit=3,
        )
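        # Optional, hedged sketch: since `load_best_model_at_end` and
        # `metric_for_best_model` are set, early stopping could be layered on with
        # the stock callback (assumes eval/save steps stay aligned, as they are here):
        # from transformers import EarlyStoppingCallback
        # early_stop = EarlyStoppingCallback(early_stopping_patience=3)
        # ...then pass `callbacks=[early_stop]` to the Trainer below.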
        # Check whether a checkpoint exists and load it
        #if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
        #    print("Loading model from checkpoint...")
        #    model = AutoModelForSeq2SeqLM.from_pretrained(training_args.output_dir)

        config = AutoConfig.from_pretrained("google/t5-efficient-tiny-nh8")
        tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8', use_fast=True, trust_remote_code=True)
        max_length = 512

        def tokenize_function(examples):
            # 'text' is the input and 'target' is the expected output
            model_inputs = tokenizer(
                examples['text'],
                max_length=max_length,
                truncation=True,
                padding='max_length',
            )
            # Tokenize the targets to use as labels
            labels = tokenizer(
                examples['target'],
                max_length=max_length,
                truncation=True,
                padding='max_length',
            )
            # Mask out padding in the labels so it is ignored by the loss
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label]
                for label in labels["input_ids"]
            ]
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs
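        # Hedged alternative: instead of `padding='max_length'` above, dynamic
        # padding keeps batches short. A sketch, assuming the DataCollatorForSeq2Seq
        # created below is actually passed to the Trainer:
        # def tokenize_dynamic(examples):
        #     model_inputs = tokenizer(examples['text'], max_length=max_length, truncation=True)
        #     labels = tokenizer(text_target=examples['target'], max_length=max_length, truncation=True)
        #     model_inputs['labels'] = labels['input_ids']
        #     return model_inputs
        # The collator then pads each batch on the fly and pads labels with -100.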
        # Load the dataset
        column_names = ['text', 'target']

        # Disabled: an earlier incremental workflow that tokenized the train split in
        # thirds and cached each stage to /data between runs.
        #try:
        #    saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
        #    if os.access(f'/data/{hub_id.strip()}_test_dataset', os.R_OK):
        #        train_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset3')
        #        saved_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_validation_dataset')
        #        print("FOUND TEST")
        #        # Create Trainer
        #        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        #        trainer = Trainer(
        #            model=model,
        #            args=training_args,
        #            train_dataset=train_dataset,
        #            eval_dataset=saved_test_dataset['input_ids'],
        #            compute_metrics=compute_metrics,
        #            data_collator=data_collator,
        #        )
        #    elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
        #        dataset = load_dataset(dataset_name.strip())
        #        dataset['test'] = dataset['test'].select(range(50))
        #        del dataset['train']
        #        del dataset['validation']
        #        test_set = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names)
        #        test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
        #        return 'TRAINING DONE'
        #    elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
        #        dataset = load_dataset(dataset_name.strip())
        #        dataset['train'] = dataset['train'].select(range(1000))
        #        train_size = len(dataset['train'])
        #        third_size = train_size // 3
        #        del dataset['test']
        #        del dataset['validation']
        #        print("FOUND VALIDATION")
        #        saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset2')
        #        third_third = dataset['train'].select(range(third_size * 2, train_size))
        #        dataset['train'] = third_third
        #        tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names)
        #        dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
        #        dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
        #        return 'THIRD THIRD LOADED'
        #    if os.access(f'/data/{hub_id.strip()}_train_dataset', os.R_OK) and not os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
        #        dataset = load_dataset(dataset_name.strip())
        #        dataset['train'] = dataset['train'].select(range(1000))
        #        dataset['validation'] = dataset['validation'].select(range(100))
        #        train_size = len(dataset['train'])
        #        third_size = train_size // 3
        #        second_third = dataset['train'].select(range(third_size, third_size * 2))
        #        dataset['train'] = second_third
        #        del dataset['test']
        #        tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names)
        #        dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
        #        dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
        #        dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
        #        return 'SECOND THIRD LOADED'
        #except Exception as e:
        #    print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
        #    dataset = load_dataset(dataset_name.strip())
        #    dataset['train'] = dataset['train'].select(range(1000))
        #    train_size = len(dataset['train'])
        #    third_size = train_size // 3
        #    # Tokenize the first third of the dataset
        #    first_third = dataset['train'].select(range(third_size))
        #    dataset['train'] = first_third
        #    del dataset['test']
        #    del dataset['validation']
        #    tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names)
        #    tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
        #    print('DONE')
        #    return 'RUN AGAIN TO LOAD REST OF DATA'
        dataset = load_dataset(dataset_name.strip())
        dataset['train'] = dataset['train'].select(range(4000))
        dataset['validation'] = dataset['validation'].select(range(200))
        # Drop the raw string columns after tokenizing; with
        # remove_unused_columns=False the Trainer would otherwise try to collate them.
        train_set = dataset.map(tokenize_function, batched=True, remove_columns=column_names)
        print('DONE')

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_set['train'],
            eval_dataset=train_set['validation'],
            data_collator=data_collator,
            #compute_metrics=compute_metrics,
            #processing_class=tokenizer,
        )
        # Fine-tune the model
        trainer.train()
        # To resume from an existing checkpoint instead:
        #if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
        #    trainer.train(resume_from_checkpoint=True)
        trainer.push_to_hub(commit_message="Training complete!")
    except Exception as e:
        return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
    return 'DONE!'
'''
# Disabled Gradio predict path (classification-style argmax over logits;
# see the seq2seq version below for a generation-based alternative).
def predict(text):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip())
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1)
    return predictions.item()
'''
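# Hedged sketch of a seq2seq-appropriate predict function: for a text-to-text
# model, generation (not a logits argmax) produces the output string.
# `model_path` is a placeholder for wherever the fine-tuned model was pushed or saved.
def predict_seq2seq(text, model_path):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    output_ids = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)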
def run_train(dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    def initialize_weights(model):
        for name, param in model.named_parameters():
            if 'encoder.block.0.layer.0.DenseReluDense.wi.weight' in name:  # Example layer
                torch.nn.init.xavier_uniform_(param.data)  # Xavier initialization
            elif 'encoder.block.0.layer.0.DenseReluDense.wo.weight' in name:  # Another example layer
                torch.nn.init.kaiming_normal_(param.data)  # Kaiming initialization

    # from_config builds the model with freshly initialized (untrained) weights
    config = AutoConfig.from_pretrained("google/t5-efficient-tiny-nh8")
    model = AutoModelForSeq2SeqLM.from_config(config)
    initialize_weights(model)
    lora_config = LoraConfig(
        r=16,             # Rank of the low-rank adaptation
        lora_alpha=32,    # Scaling factor
        lora_dropout=0.1, # Dropout for LoRA layers
        bias="none"       # Bias handling
    )
    #model = get_peft_model(model, lora_config)
    result = fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad)
    return result
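# Hedged sketch: if `get_peft_model` above is re-enabled, PEFT can report how few
# parameters actually train under LoRA before kicking off fine-tuning:
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()  # prints "trainable params: ... || all params: ..."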
# Create the Gradio interface
try:
    iface = gr.Interface(
        fn=run_train,
        inputs=[
            gr.Textbox(label="Dataset Name (e.g., 'imdb')"),
            gr.Textbox(label="HF hub to push to after training"),
            gr.Textbox(label="HF API token"),
            gr.Slider(minimum=1, maximum=10, value=3, label="Number of Epochs", step=1),
            gr.Slider(minimum=1, maximum=2000, value=1, label="Batch Size", step=1),
            gr.Slider(minimum=1, maximum=1000, value=1, label="Learning Rate (e-5)", step=1),
            gr.Slider(minimum=1, maximum=100, value=1, label="Gradient accumulation", step=1),
        ],
        outputs="text",
        title="Fine-Tune Hugging Face Model",
        description="This interface allows you to fine-tune a Hugging Face model on a specified dataset."
    )
    '''
    iface = gr.Interface(
        fn=predict,
        inputs=[
            gr.Textbox(label="Query"),
        ],
        outputs="text",
        title="Test Fine-Tuned Hugging Face Model",
        description="This interface allows you to test a fine-tuned Hugging Face model."
    )
    '''
    # Launch the interface
    iface.launch()
except Exception as e:
    print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")