from pathlib import Path
import time
import modal
from modal import App, Image, Volume
import os
#os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
VOL_MOUNT_PATH = Path("/vol")
cuda_version = "12.4.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
_os = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{_os}"
#image = Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.11")
image = Image.debian_slim(python_version="3.10")
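# The commented-out registry image above ships the full CUDA toolkit; the slim Debian image
# used here instead relies on the CUDA libraries bundled with the pip-installed torch wheels.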
image = image.pip_install(
"accelerate",
"transformers",
"torch",
"datasets",
"tensorboard",
"trl",
"xformers",
"bitsandbytes",
"peft",
"protobuf==3.20.*",
"onnxruntime",
"onnx",
"setfit",
"nltk",
"firebase_admin",
"openai",
"evaluate",
"sentencepiece",
"pandas",
"scikit-learn",
"huggingface_hub"
)
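# Note: the packages above are unpinned (except protobuf), so each image build picks up whatever
# versions are current at build time; pinning versions would make rebuilds reproducible.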
app = App(
name="finetune-run", image=image
) # Note: prior to April 2024, "app" was called "stub"
output_vol = Volume.from_name("finetune-volume", create_if_missing=True)
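# The named volume persists across runs and is mounted at VOL_MOUNT_PATH inside run_finetune below.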
@app.function(gpu="any")
def gpu_function():
import subprocess
import torch
subprocess.run(["nvidia-smi"])
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
GPU_CONFIG = modal.gpu.A100(count=1, size="80GB")
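# A single 80 GB A100; modal.gpu.A100 should also accept size="40GB" if the smaller card is enough.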
@app.function(
    gpu=GPU_CONFIG,   # originally meant to come from self.params.gpu
    timeout=7200,     # seconds (2 hours); originally self.params.timeout
volumes={VOL_MOUNT_PATH: output_vol},
secrets=[modal.Secret.from_dotenv()]
)
def run_finetune(data):
import subprocess
import torch
subprocess.run(["nvidia-smi"])
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
import pandas as pd
    # Persist the JSON payload sent from the local entrypoint, then load it with pandas below
    with open('./features_ms_deberta_v3.json', 'w') as f:
        f.write(data)
df = pd.read_json('./features_ms_deberta_v3.json', lines=False)
    # Upsample by simple replication: concatenate 50 copies of the dataframe
    df = pd.concat([df] * 50, ignore_index=True)
from datasets import Dataset
from transformers import (
AutoModel,AutoTokenizer,
AutoModelForSequenceClassification, DebertaV2Model, DebertaV2Tokenizer, DebertaV2ForSequenceClassification,
Trainer, TrainingArguments ,EvalPrediction,DataCollatorWithPadding
)
from transformers import EarlyStoppingCallback
import numpy as np
from itertools import chain
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
from sklearn.model_selection import train_test_split
from transformers import pipeline
import torch
from evaluate import load as load_metric
HF_ORGANIZATION = "rafaelsandroni"
token = os.getenv("HF_TOKEN")
# Define the task for zero-shot classification
task = "zero-shot-classification"
    # Candidate pre-trained NLI checkpoints; the last uncommented assignment is the one used
    # model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
    # model_name = "MoritzLaurer/bge-m3-zeroshot-v2.0"
    # model_name = "cross-encoder/nli-deberta-v3-base"
    model_name = "tasksource/deberta-small-long-nli"
# Define the directory where the output/results will be saved
output_dir = "./"
# Clear the CUDA cache to free up GPU memory
torch.cuda.empty_cache()
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")  # informational; Trainer handles device placement itself
    # Load the sequence-classification head and tokenizer from the chosen checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def create_input_sequence(sample):
# Get text from the 'premise' column
text = sample["premise"]
# Get hypothesis from the 'hypothesis' column
hypothesis = sample['hypothesis']
# Get label from the 'class' column
label = sample['class']
# Encoding the sequence using the tokenizer
encoded_sequence = tokenizer(text, hypothesis, truncation=True, padding='max_length')
# Assign label to the encoded sequence
encoded_sequence['labels'] = label
# Decode the input_ids
encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
return encoded_sequence
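    # Illustrative example (hypothetical values) of what the helper produces for one row:
    #   premise    = "The payment was approved."
    #   hypothesis = "This example is about a payment."
    # The tokenizer encodes the pair as "[CLS] premise [SEP] hypothesis [SEP]" padded to the
    # model's max length, 'labels' carries the id from the 'class' column, and 'input_sentence'
    # keeps a decoded copy for inspection.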
    # Map targets to NLI-style label ids: 'PASS' -> 0 (entailment), anything else -> 2
    # (contradiction, in the usual label convention for these NLI checkpoints)
    df['class'] = df['target'].apply(lambda t: 0 if t == 'PASS' else 2)
print(df.head())
print(df.shape)
train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)
    # train_test_split already shuffles rows (shuffle=True by default, seeded by random_state),
    # so the splits are used as-is here
    train_shuffle_df = train_data
    test_shuffle_df = test_data
# Create a Dataset object from the shuffled train DataFrame
train = Dataset.from_pandas(train_shuffle_df)
# Create a Dataset object from the shuffled test DataFrame
test = Dataset.from_pandas(test_shuffle_df)
# Map the create_input_sequence function to the train and test datasets
# This function encodes the data, adds labels, and generates input sentences
    train_dataset = train.map(create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"])
    test_dataset = test.map(create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "premise"])
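    # Note: batched=True with batch_size=1 hands create_input_sequence one row at a time, but in
    # list form, which is what lets tokenizer(...) and tokenizer.batch_decode(...) operate on lists.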
def compute_metrics(p: EvalPrediction):
# Extracting predictions from EvalPrediction object
preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        # Obtaining the predicted classes
        preds = np.argmax(preds, axis=1)
        # Fraction of examples predicted as class 2 (the non-PASS / contradiction label)
        ratio = np.mean(preds == 2)
# Dictionary to store computed metrics
result = {}
# Loading evaluation metrics
metric_f1 = load_metric("f1")
metric_precision = load_metric("precision")
metric_recall = load_metric("recall")
metric_acc = load_metric("accuracy")
        # Computing accuracy and macro-averaged precision, recall, and F1
        result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
        result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average='macro')['precision']
        result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average='macro')["recall"]
        result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average='macro')["f1"]
result["ratio"] = ratio
return result
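    # Note: the metrics are reloaded on every evaluation call; they could be loaded once outside
    # compute_metrics if evaluation overhead ever becomes noticeable.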
training_args = TrainingArguments(
output_dir=output_dir, # Output directory
logging_dir=output_dir + "/logs",# Output directory for logging
num_train_epochs=1, # Total number of training epochs
per_device_train_batch_size=16, # Batch size per device during training
per_device_eval_batch_size=16, # Batch size for evaluation
        warmup_steps=4,                  # Warmup steps for the LR scheduler (takes precedence over warmup_ratio when > 0)
        weight_decay=0.01,               # Strength of weight decay
        gradient_accumulation_steps=2,   # Number of steps over which gradients are accumulated before an optimizer update
        learning_rate=2e-05,             # Peak learning rate
        warmup_ratio=0.06,               # Proportion of training steps used for warmup (ignored here because warmup_steps > 0)
        label_smoothing_factor=0.1,      # Regularization to keep the model from becoming overconfident
        evaluation_strategy='steps',     # Evaluate every eval_steps optimizer steps
        logging_strategy='steps',        # Log every logging_steps optimizer steps
        logging_steps=10,                # Log every 10 steps
        eval_steps=10,                   # Evaluate every 10 steps
logging_first_step=True,
do_eval=True,
hub_model_id="rafaelsandroni/ms-deberta-v2-xlarge-mnli-finetuned-pt",
load_best_model_at_end=True,
)
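    # load_best_model_at_end=True requires the evaluation and save strategies to match (the
    # transformers default save_strategy is 'steps') and save_steps to be a round multiple of
    # eval_steps; the default save_steps=500 with eval_steps=10 satisfies that.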
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
trainer = Trainer(
model=model, # The instantiated model to be trained
args=training_args, # Training arguments, defined above
train_dataset=train_dataset, # Training dataset
eval_dataset=test_dataset, # Evaluation dataset
tokenizer=tokenizer,
compute_metrics=compute_metrics,
callbacks=callbacks
)
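    # EarlyStoppingCallback stops training after 3 evaluations without improvement; with
    # load_best_model_at_end=True and no metric_for_best_model set, transformers falls back to
    # monitoring the evaluation loss.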
trainer.train()
trainer.evaluate()
t = time.strftime("%Y%m%d%H%M%S")
v = 2
commit = f"dev-v{v}-{t}"
trainer.push_to_hub(commit, token=token)
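    # Optionally persist a copy of the fine-tuned model to the mounted Modal volume as well
    # (a sketch; the "model" subdirectory name is arbitrary):
    trainer.save_model(str(VOL_MOUNT_PATH / "model"))
    output_vol.commit()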
@app.local_entrypoint()
def run():
import time
import pandas as pd
t0 = time.time()
#df = pd.read_json('./features_ms_deberta_v2.json', lines=False)
with open('./features_ms_deberta_v3.json') as f:
data = f.read()
run_finetune.remote(data)
print("Full time spent:", time.time() - t0)