Akaash

Add Python code for fine-tuning

405f3d6 about 1 year ago

No virus

4.84 kB

	from transformers import TFDistilBertModel
	from transformers import DistilBertConfig
	from transformers import AutoTokenizer
	from transformers import TFAutoModelForTokenClassification
	from transformers import create_optimizer
	from transformers import DataCollatorForTokenClassification
	import tensorflow as tf


	from transformers.keras_callbacks import PushToHubCallback
	from tensorflow.keras.callbacks import TensorBoard


	from datasets import load_dataset, load_metric

	# Run-wide settings: the task name (also the prefix of the dataset's tag
	# column), the checkpoint to start from, and whether the non-first tokens of
	# a word keep that word's label.
	task = "ner"
	model_checkpoint = "distilbert-base-uncased"
	label_all_tokens = True

	dataset = load_dataset("conll2003")

	# A DistilBERT config with ReLU activation and attention dropout 0.4, and a
	# model instantiated directly from that config.
	# NOTE(review): neither object is handed to the model that is later loaded
	# from `model_checkpoint`, and `tf_model` is not referenced again in the
	# visible file -- confirm both are still needed.
	my_config = DistilBertConfig.from_pretrained(
	    "distilbert-base-uncased",
	    activation="relu",
	    attention_dropout=0.4,
	)
	tf_model = TFDistilBertModel(my_config)

	tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

	def tokenize_and_align_labels(examples):
	    """Tokenize a batch of pre-split sentences and attach per-token labels.

	    Reads ``examples["tokens"]`` and ``examples[f"{task}_tags"]`` and uses the
	    module-level ``tokenizer``, ``task`` and ``label_all_tokens``.

	    Returns the tokenizer output with an added ``"labels"`` entry holding one
	    list of label ids per example, aligned to the produced tokens.
	    """
	    encoded = tokenizer(
	        examples["tokens"], truncation=True, is_split_into_words=True
	    )

	    aligned = []
	    for batch_idx, word_labels in enumerate(examples[f"{task}_tags"]):
	        row = []
	        prev_word = None
	        for word in encoded.word_ids(batch_index=batch_idx):
	            if word is None:
	                # Special tokens have no word id; -100 makes the loss
	                # function ignore them.
	                row.append(-100)
	            elif word != prev_word or label_all_tokens:
	                # First token of a word always gets the word's label; later
	                # tokens of the same word get it only when
	                # label_all_tokens is set.
	                row.append(word_labels[word])
	            else:
	                row.append(-100)
	            prev_word = word
	        aligned.append(row)

	    encoded["labels"] = aligned
	    return encoded


	# Apply the tokenize-and-align step to every split, in batches.
	tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

	# Debug output: the first raw training example and its aligned label ids.
	print(dataset["train"][0])
	# Index the row first and then the column, so only one example is read
	# instead of materialising the whole `labels` column just to print one row.
	print(tokenized_datasets["train"][0]["labels"])

	# Label names come from the dataset's feature metadata; build both
	# directions of the id <-> name mapping from them.
	label_list = dataset["train"].features[f"{task}_tags"].feature.names
	id2label = dict(enumerate(label_list))
	label2id = {name: idx for idx, name in id2label.items()}

	model = TFAutoModelForTokenClassification.from_pretrained(
	    model_checkpoint,
	    num_labels=len(label_list),
	    id2label=id2label,
	    label2id=label2id,
	)

	num_train_epochs = 3
	batch_size = 16
	# Whole batches per epoch (integer division) times the number of epochs.
	steps_per_epoch = len(tokenized_datasets["train"]) // batch_size
	num_train_steps = steps_per_epoch * num_train_epochs
	optimizer, lr_schedule = create_optimizer(
	    init_lr=2e-5,
	    num_train_steps=num_train_steps,
	    num_warmup_steps=0,
	)

	# No loss is passed to compile().
	# NOTE(review): presumably this relies on the model's built-in loss -- confirm.
	model.compile(optimizer=optimizer)

	# Token-classification collator built around the tokenizer, asked to return
	# NumPy ("np") tensors.
	data_collator = DataCollatorForTokenClassification(
	    tokenizer,
	    return_tensors="np",
	)

	# Training batches are shuffled.
	train_set = model.prepare_tf_dataset(
	    tokenized_datasets["train"],
	    batch_size=batch_size,
	    shuffle=True,
	    collate_fn=data_collator,
	)

	# Validation batches are not shuffled.
	validation_set = model.prepare_tf_dataset(
	    tokenized_datasets["validation"],
	    batch_size=batch_size,
	    shuffle=False,
	    collate_fn=data_collator,
	)

	# Hub repo id: the last path component of the checkpoint name plus a
	# "-finetuned-<task>" suffix.
	model_name = model_checkpoint.rsplit("/", 1)[-1]
	push_to_hub_model_id = f"{model_name}-finetuned-{task}"

	tensorboard_callback = TensorBoard(log_dir="./model/logs")

	push_to_hub_callback = PushToHubCallback(
	    output_dir="./tc_model_save",
	    tokenizer=tokenizer,
	    hub_model_id=push_to_hub_model_id,
	)
	import numpy as np
	from transformers.keras_callbacks import KerasMetricCallback

	# Load the seqeval metric and run it once on a single training example,
	# using its gold labels as both predictions and references. The result is
	# discarded.
	# NOTE(review): presumably a smoke test of the metric -- confirm it is
	# still wanted.
	example = dataset["train"][4]
	metric = load_metric("seqeval")
	labels = [label_list[tag_id] for tag_id in example[f"{task}_tags"]]
	metric.compute(predictions=[labels], references=[labels])


	def compute_metrics(p):
	    """Score predictions against labels with the module-level ``metric``.

	    ``p`` is a ``(predictions, labels)`` pair. Predictions are reduced with
	    an argmax over axis 2, positions whose label is -100 are dropped, and
	    the remaining ids are mapped to names through ``label_list``.

	    Returns a dict with the overall precision, recall, f1 and accuracy.
	    """
	    raw_predictions, label_ids = p
	    predicted_ids = np.argmax(raw_predictions, axis=2)

	    true_predictions = []
	    true_labels = []
	    for predicted_row, label_row in zip(predicted_ids, label_ids):
	        # Keep only positions with a real label (-100 marks ignored ones).
	        kept = [
	            (predicted, gold)
	            for predicted, gold in zip(predicted_row, label_row)
	            if gold != -100
	        ]
	        true_predictions.append([label_list[predicted] for predicted, _ in kept])
	        true_labels.append([label_list[gold] for _, gold in kept])

	    results = metric.compute(predictions=true_predictions, references=true_labels)
	    return {
	        "precision": results["overall_precision"],
	        "recall": results["overall_recall"],
	        "f1": results["overall_f1"],
	        "accuracy": results["overall_accuracy"],
	    }


	# Computes the compute_metrics scores on the validation set.
	metric_callback = KerasMetricCallback(
	    metric_fn=compute_metrics,
	    eval_dataset=validation_set,
	)

	callbacks = (metric_callback, tensorboard_callback, push_to_hub_callback)

	model.fit(
	    train_set,
	    validation_data=validation_set,
	    epochs=num_train_epochs,
	    callbacks=callbacks,
	)

	# Publish the configuration of the model that was actually trained.
	# `my_config` is a separate config (different activation and dropout, no
	# label maps) that was never applied to `model`; pushing it to this repo
	# would overwrite the fine-tuned model's config.json with a mismatched one.
	model.config.push_to_hub(push_to_hub_model_id)