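"""Fine-tune DistilBERT for named-entity recognition (NER) on CoNLL-2003 with the
TensorFlow/Keras API of Hugging Face Transformers, then push the result to the Hub."""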
import numpy as np
import tensorflow as tf
from datasets import load_dataset, load_metric
from tensorflow.keras.callbacks import TensorBoard
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    DistilBertConfig,
    TFAutoModelForTokenClassification,
    create_optimizer,
)
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
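# Load the CoNLL-2003 benchmark; its "ner_tags" column holds one tag id per word.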
dataset = load_dataset("conll2003")
task = "ner"
model_checkpoint = "distilbert-base-uncased"

# Custom config: ReLU activation and a higher attention dropout than the default.
# These settings are applied to the fine-tuned model below.
my_config = DistilBertConfig.from_pretrained(
    model_checkpoint, activation="relu", attention_dropout=0.4
)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
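# If True, subword pieces after the first inherit their word's label; if False,
# they are masked out with -100 in the loss.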
label_all_tokens = True
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level NER tags to subword tokens."""
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100
            # so they are automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current
            # label or -100, depending on the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Inspect one example before and after tokenization and label alignment.
print(dataset["train"][0])
print(tokenized_datasets["train"]["labels"][0])
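# Build the label name <-> id mappings from the dataset's NER tag feature.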
label_list = dataset["train"].features[f"{task}_tags"].feature.names
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}
# Load the pretrained checkpoint with a fresh token-classification head, applying
# the custom activation and attention dropout from my_config.
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    activation=my_config.activation,
    attention_dropout=my_config.attention_dropout,
)
num_train_epochs = 3
batch_size = 16
num_train_steps = (len(tokenized_datasets["train"]) // batch_size) * num_train_epochs

# AdamW with a linearly decaying learning rate and no warmup.
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)
# No loss is passed: Transformers TF models compute the token-classification
# loss internally from the "labels" column.
model.compile(optimizer=optimizer)
# Dynamically pad each batch to its longest sequence; NumPy tensors are what
# prepare_tf_dataset expects from the collator.
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="np")

train_set = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
validation_set = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)
model_name = model_checkpoint.split("/")[-1]
push_to_hub_model_id = f"{model_name}-finetuned-{task}"

tensorboard_callback = TensorBoard(log_dir="./model/logs")
# Periodically saves and uploads the model, tokenizer, and config during training.
push_to_hub_callback = PushToHubCallback(
    output_dir="./tc_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id,
)
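# Sanity-check seqeval by scoring a gold example against itself
# (all metrics should come out as 1.0).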
example = dataset["train"][4]
metric = load_metric("seqeval")
labels = [label_list[i] for i in example[f"{task}_tags"]]
print(metric.compute(predictions=[labels], references=[labels]))
def compute_metrics(p):
    """Convert logits to tag names, drop masked positions, and score with seqeval."""
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens).
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
# Runs compute_metrics over the validation set at the end of each epoch.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=validation_set
)
callbacks = [metric_callback, tensorboard_callback, push_to_hub_callback]
model.fit(
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks,
)
# The PushToHubCallback uploads the fine-tuned model, tokenizer, and config at the
# end of training, so no separate config push is needed here.
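# A quick way to try the fine-tuned model is the token-classification pipeline.
# A minimal sketch, assuming training above completed; the example sentence and
# the aggregation_strategy value are illustrative choices, not part of the
# training recipe itself.
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",  # merge subword pieces into whole-entity spans
)
print(ner("Hugging Face is based in New York City."))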