"""
https://github.com/huggingface/transformers/tree/66fd3a8d626a32989f4569260db32785c6cbf42a/examples/pytorch/token-classification
run this command in terminal to login to huggingface hub
huggingface-cli login
instead of
from huggingface_hub import notebook_login
notebook_login()
"""
import torch
import datasets
import evaluate
import numpy as np
from tqdm.auto import tqdm
from transformers import Trainer, AutoModelForTokenClassification, TrainingArguments, DataCollatorForTokenClassification
dataset = datasets.load_dataset("json", data_files="data/ner_input_data/ner_dataset.json")
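# A minimal sketch of what each record in ner_dataset.json is assumed to look
# like, inferred from the "tokens" and "ner_tags" columns used below (the
# exact file schema is an assumption):
# {"tokens": ["John", "works", "at", "MI6"], "ner_tags": ["B-PER", "O", "O", "B-ORG"]}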
# Convert the ner_tags column from lists of strings to a Sequence of ClassLabels,
# as expected by Hugging Face for the target variable:
# https://discuss.huggingface.co/t/sequence-features-class-label-cast/44638/3
def get_label_list(labels):
    """Create a sorted list of unique NER labels to build a ClassLabel.

    Args:
        labels: the ner_tags column of the dataset (list of label lists)

    Returns:
        list[str]: sorted unique NER labels

    https://github.com/huggingface/transformers/blob/66fd3a8d626a32989f4569260db32785c6cbf42a/examples/pytorch/token-classification/run_ner.py#L320
    """
    unique_labels = set()
    for label in labels:
        unique_labels = unique_labels | set(label)
    label_list = list(unique_labels)
    label_list.sort()
    return label_list
all_labels = get_label_list(dataset['train']["ner_tags"])
dataset = dataset.cast_column("ner_tags", datasets.Sequence(datasets.ClassLabel(names=all_labels)))
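# After the cast, ner_tags holds ClassLabel integer ids; the string names are
# recoverable via dataset["train"].features["ner_tags"].feature.names.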
raw_datasets = dataset["train"].train_test_split(train_size=0.8, seed=20)
raw_datasets["validation"] = raw_datasets.pop("test")
raw_datasets["train"][0]["tokens"]
raw_datasets["train"][0]["ner_tags"]
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature
label_names = ner_feature.feature.names
label_names
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(tokenizer.is_fast)
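# word_ids() below is only available on fast (Rust-backed) tokenizers, hence
# the check above.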
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
print(inputs.tokens())
print(inputs.word_ids())
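# For a hypothetical pre-split input like ["Mark", "Zuckerberg"], a WordPiece
# tokenizer could produce ["[CLS]", "Mark", "Zucker", "##berg", "[SEP]"] with
# word_ids [None, 0, 1, 1, None]: None marks special tokens, and a repeated id
# marks sub-tokens of the same original word.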
def align_labels_with_tokens(labels, word_ids):
    """Expand the label list to match the ##subword tokens produced by tokenization.

    Special tokens get a label of -100 (ignored in the loss function).
    For tokens inside a word but not at its beginning, B- is replaced with I-.

    Args:
        labels: word-level label ids for one example
        word_ids: word_ids() of the tokenized example

    Returns:
        list[int]: token-level label ids
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX. The HF course uses
            # `label % 2 == 1` here, which assumes the CoNLL label order
            # (O, B-PER, I-PER, ...); our alphabetically sorted label list
            # breaks that assumption, so convert by name instead.
            label_name = label_names[label]
            if label_name.startswith("B-"):
                label = label_names.index("I-" + label_name[2:])
            new_labels.append(label)
    return new_labels
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))
def tokenize_and_align_labels(examples):
    """Tokenize a batch of examples and align labels with the ##subword tokens.

    Args:
        examples: a batch from the dataset with "tokens" and "ner_tags" columns

    Returns:
        the tokenized batch with a token-aligned "labels" key added
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
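# Unlike the default collator, DataCollatorForTokenClassification also pads
# the "labels" to the longest sequence in the batch, using -100 so padded
# positions are ignored by the loss.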
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])
metric = evaluate.load("seqeval")
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])
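# seqeval returns per-entity scores plus "overall_precision", "overall_recall",
# "overall_f1" and "overall_accuracy"; compute_metrics below pulls out the
# overall values from the (logits, labels) pair the Trainer hands it.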
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
""" Uncomment to uses highlevel Trainer from huggingface instead of custom training loop
model = AutoModelForTokenClassification.from_pretrained(
model_checkpoint,
id2label=id2label,
label2id=label2id,
)
model.config.num_labels
args = TrainingArguments(
output_dir="source/services/ner/model/hf_tokenclassification/bert-finetuned-legalentity-ner",
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=6,
weight_decay=0.01,
push_to_hub=True,
)
trainer = Trainer(
model=model,
args=args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
compute_metrics=compute_metrics,
tokenizer=tokenizer,
)
trainer.train()
trainer.push_to_hub(commit_message="Training complete")
"""
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)
from accelerate import Accelerator
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
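# accelerator.prepare() moves the model, optimizer and dataloaders to the
# right device(s) and wraps them so the same loop works on CPU, a single GPU,
# or a distributed setup.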
from transformers import get_scheduler
num_train_epochs = 6
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
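# With num_warmup_steps=0, the "linear" schedule decays the learning rate
# from its initial value to 0 over num_training_steps.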
from huggingface_hub import Repository, get_full_repo_name
model_name = "bert-finetuned-legalentity-ner-accelerate"
repo_name = get_full_repo_name(model_name)
print(repo_name)
output_dir = "source/services/ner/model/hf_tokenclassification/bert-finetuned-legalentity-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)
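# Note: Repository and get_full_repo_name follow the HF course flow; both are
# deprecated in recent huggingface_hub releases in favour of HfApi methods
# such as upload_folder.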
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess returns (true_labels, true_predictions); unpack in that
        # order (the HF course snippet swaps them, mislabeling the metric inputs)
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
from transformers import pipeline
# Replace this with your own checkpoint
model_checkpoint = "aimlnerd/bert-finetuned-legalentity-ner-accelerate"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
print(token_classifier("My name is James Bond and I work at MI6 in London."))