from pathlib import Path
import time
import modal
from modal import App, Image, Volume
import os
#os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
VOL_MOUNT_PATH = Path("/vol")
cuda_version = "12.4.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
_os = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{_os}"
#image = Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.11")
image = Image.debian_slim(python_version="3.10")
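# The commented-out registry image above ships the full CUDA toolkit; the slim Debian image
# used here instead relies on the CUDA libraries bundled with the pip-installed torch wheels.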
image = image.pip_install(
"accelerate",
"transformers",
"torch",
"datasets",
"tensorboard",
"trl",
"xformers",
"bitsandbytes",
"peft",
"protobuf==3.20.*",
"onnxruntime",
"onnx",
"setfit",
"nltk",
"firebase_admin",
"openai",
"evaluate",
"sentencepiece",
"pandas",
"scikit-learn",
"huggingface_hub"
)
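# Note: the packages above are unpinned (except protobuf), so each image build picks up whatever
# versions are current at build time; pinning versions would make rebuilds reproducible.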
app = App(
name="finetune-run", image=image
) # Note: prior to April 2024, "app" was called "stub"
output_vol = Volume.from_name("finetune-volume", create_if_missing=True)
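# The named volume persists across runs and is mounted at VOL_MOUNT_PATH inside run_finetune below.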
@app.function(gpu="any")
def gpu_function():
import subprocess
import torch
subprocess.run(["nvidia-smi"])
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
GPU_CONFIG = modal.gpu.A100(count=1, size="80GB")
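# A single 80 GB A100; modal.gpu.A100 should also accept size="40GB" if the smaller card is enough.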
@app.function(
    gpu=GPU_CONFIG,   # originally meant to come from self.params.gpu
    timeout=7200,     # seconds (2 hours); originally self.params.timeout
volumes={VOL_MOUNT_PATH: output_vol},
secrets=[modal.Secret.from_dotenv()]
)
def run_finetune(data):
import subprocess
import torch
subprocess.run(["nvidia-smi"])
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
import pandas as pd
    # Persist the JSON payload sent from the local entrypoint, then load it with pandas below
    with open('./features_ms_deberta_v3.json', 'w') as f:
        f.write(data)
df = pd.read_json('./features_ms_deberta_v3.json', lines=False)
    # Upsample by simple replication: concatenate 50 copies of the dataframe
    df = pd.concat([df] * 50, ignore_index=True)
from datasets import Dataset
from transformers import (
AutoModel,AutoTokenizer,
AutoModelForSequenceClassification, DebertaV2Model, DebertaV2Tokenizer, DebertaV2ForSequenceClassification,
Trainer, TrainingArguments ,EvalPrediction,DataCollatorWithPadding
)
from transformers import EarlyStoppingCallback
import numpy as np
from itertools import chain
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
from sklearn.model_selection import train_test_split
from transformers import pipeline
import torch
from evaluate import load as load_metric
HF_ORGANIZATION = "rafaelsandroni"
token = os.getenv("HF_TOKEN")
# Define the task for zero-shot classification
task = "zero-shot-classification"
    # Candidate pre-trained NLI checkpoints; the last uncommented assignment is the one used
    # model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
    # model_name = "MoritzLaurer/bge-m3-zeroshot-v2.0"
    # model_name = "cross-encoder/nli-deberta-v3-base"
    model_name = "tasksource/deberta-small-long-nli"
# Define the directory where the output/results will be saved
output_dir = "./"
# Clear the CUDA cache to free up GPU memory
torch.cuda.empty_cache()
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")  # informational; Trainer handles device placement itself
    # Load the sequence-classification head and tokenizer from the chosen checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def create_input_sequence(sample):
# Get text from the 'premise' column
text = sample["premise"]
# Get hypothesis from the 'hypothesis' column
hypothesis = sample['hypothesis']
# Get label from the 'class' column
label = sample['class']
# Encoding the sequence using the tokenizer
encoded_sequence = tokenizer(text, hypothesis, truncation=True, padding='max_length')
# Assign label to the encoded sequence
encoded_sequence['labels'] = label
# Decode the input_ids
encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
return encoded_sequence
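    # Illustrative example (hypothetical values) of what the helper produces for one row:
    #   premise    = "The payment was approved."
    #   hypothesis = "This example is about a payment."
    # The tokenizer encodes the pair as "[CLS] premise [SEP] hypothesis [SEP]" padded to the
    # model's max length, 'labels' carries the id from the 'class' column, and 'input_sentence'
    # keeps a decoded copy for inspection.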
    # Map targets to NLI-style label ids: 'PASS' -> 0 (entailment), anything else -> 2
    # (contradiction, in the usual label convention for these NLI checkpoints)
    df['class'] = df['target'].apply(lambda t: 0 if t == 'PASS' else 2)
print(df.head())
print(df.shape)
train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)
    # train_test_split already shuffles rows (shuffle=True by default, seeded by random_state),
    # so the splits are used as-is here
    train_shuffle_df = train_data
    test_shuffle_df = test_data
# Create a Dataset object from the shuffled train DataFrame
train = Dataset.from_pandas(train_shuffle_df)
# Create a Dataset object from the shuffled test DataFrame
test = Dataset.from_pandas(test_shuffle_df)
# Map the create_input_sequence function to the train and test datasets
# This function encodes the data, adds labels, and generates input sentences
    train_dataset = train.map(create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"])
    test_dataset = test.map(create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"])
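    # Note: batched=True with batch_size=1 hands create_input_sequence one row at a time, but in
    # list form, which is what lets tokenizer(...) and tokenizer.batch_decode(...) operate on lists.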
def compute_metrics(p: EvalPrediction):
# Extracting predictions from EvalPrediction object
preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        # Obtaining the predicted classes
        preds = np.argmax(preds, axis=1)
        # Fraction of examples predicted as class 2 (the non-PASS / contradiction label)
        ratio = np.mean(preds == 2)
# Dictionary to store computed metrics
result = {}
# Loading evaluation metrics
metric_f1 = load_metric("f1")
metric_precision = load_metric("precision")
metric_recall = load_metric("recall")
metric_acc = load_metric("accuracy")
        # Computing accuracy and macro-averaged precision, recall, and F1
        result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
        result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average='macro')['precision']
        result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average='macro')["recall"]
        result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average='macro')["f1"]
result["ratio"] = ratio
return result
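    # Note: the metrics are reloaded on every evaluation call; they could be loaded once outside
    # compute_metrics if evaluation overhead ever becomes noticeable.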
training_args = TrainingArguments(
output_dir=output_dir, # Output directory
logging_dir=output_dir + "/logs",# Output directory for logging
num_train_epochs=1, # Total number of training epochs
per_device_train_batch_size=16, # Batch size per device during training
per_device_eval_batch_size=16, # Batch size for evaluation
        warmup_steps=4,                  # Warmup steps for the LR scheduler (takes precedence over warmup_ratio when > 0)
        weight_decay=0.01,               # Strength of weight decay
        gradient_accumulation_steps=2,   # Number of steps over which gradients are accumulated before an optimizer update
        learning_rate=2e-05,             # Peak learning rate
        warmup_ratio=0.06,               # Proportion of training steps used for warmup (ignored here because warmup_steps > 0)
        label_smoothing_factor=0.1,      # Regularization to keep the model from becoming overconfident
        evaluation_strategy='steps',     # Evaluate every eval_steps optimizer steps
        logging_strategy='steps',        # Log every logging_steps optimizer steps
        logging_steps=10,                # Log every 10 steps
        eval_steps=10,                   # Evaluate every 10 steps
logging_first_step=True,
do_eval=True,
hub_model_id="rafaelsandroni/ms-deberta-v2-xlarge-mnli-finetuned-pt",
load_best_model_at_end=True,
)
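    # load_best_model_at_end=True requires the evaluation and save strategies to match (the
    # transformers default save_strategy is 'steps') and save_steps to be a round multiple of
    # eval_steps; the default save_steps=500 with eval_steps=10 satisfies that.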
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
trainer = Trainer(
model=model, # The instantiated model to be trained
args=training_args, # Training arguments, defined above
train_dataset=train_dataset, # Training dataset
eval_dataset=test_dataset, # Evaluation dataset
tokenizer=tokenizer,
compute_metrics=compute_metrics,
callbacks=callbacks
)
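    # EarlyStoppingCallback stops training after 3 evaluations without improvement; with
    # load_best_model_at_end=True and no metric_for_best_model set, transformers falls back to
    # monitoring the evaluation loss.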
trainer.train()
trainer.evaluate()
t = time.strftime("%Y%m%d%H%M%S")
v = 2
commit = f"dev-v{v}-{t}"
trainer.push_to_hub(commit, token=token)
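    # Optionally persist a copy of the fine-tuned model to the mounted Modal volume as well
    # (a sketch; the "model" subdirectory name is arbitrary):
    trainer.save_model(str(VOL_MOUNT_PATH / "model"))
    output_vol.commit()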
@app.local_entrypoint()
def run():
import time
import pandas as pd
t0 = time.time()
#df = pd.read_json('./features_ms_deberta_v2.json', lines=False)
with open('./features_ms_deberta_v3.json') as f:
data = f.read()
run_finetune.remote(data)
print("Full time spent:", time.time() - t0)