|
class TokenizeAndAlignLabelsStep():
    """Pipeline step that tokenizes input text and builds a boolean mask
    marking the first sub-token of every word (the label-carrying positions
    for token-classification / CRF decoding)."""

    def tokenize_and_align_labels(self, examples, tokenizer):
        """Tokenize `examples` and record which token positions start a word.

        Args:
            examples: passed straight through to `tokenizer` — presumably a
                pre-tokenized sequence of words; verify against the caller.
            tokenizer: a Hugging Face *fast* tokenizer (must expose
                `word_ids()` on its output).

        Returns:
            The tokenizer output (padded/truncated to 128 tokens), extended
            with two extra entries:
                "tokens": the input id of the first sub-token of each word.
                "labels_mask": per-token booleans — True only at a word's
                    first sub-token; False for special tokens, padding, and
                    continuation sub-tokens.
        """
        tokenized_inputs = tokenizer(examples, padding='max_length', truncation=True, max_length=128)

        # word_ids() maps each token position to the index of the word it
        # came from (None for special tokens such as [CLS]/[SEP]/padding).
        word_ids = tokenized_inputs.word_ids()

        previous_word_idx = None
        tokens = []
        labels_mask = []

        # BUG FIX: the original indexed `input_ids` with `word_idx` (a *word*
        # index), not the token's position in the sequence, so the collected
        # ids were not the words' first sub-tokens. Enumerate to get the
        # token position and index with that instead.
        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is None:
                # Special token or padding: never a labeled position.
                labels_mask.append(False)
            elif word_idx != previous_word_idx:
                # First sub-token of a new word: labeled position.
                labels_mask.append(True)
                tokens.append(tokenized_inputs["input_ids"][token_idx])
            else:
                # Continuation sub-token of the same word: not labeled.
                labels_mask.append(False)

            previous_word_idx = word_idx

        tokenized_inputs["tokens"] = tokens
        tokenized_inputs["labels_mask"] = labels_mask

        return tokenized_inputs
|
def main():
    """Register the custom CRF pipeline, instantiate it, and publish the
    saved pipeline to the Hugging Face Hub."""
    # Make the custom pipeline type resolvable by name before building it.
    PIPELINE_REGISTRY.register_pipeline(
        "PT-BERT-Large-CRF-HAREM-Default-pipeline",
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )

    # Prefer the GPU when one is available, otherwise fall back to CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    classifier = pipeline(
        "PT-BERT-Large-CRF-HAREM-Default-pipeline",
        model="arubenruben/PT-BERT-Large-CRF-HAREM-Default",
        device=device,
        trust_remote_code=True,
    )

    # Clone the model repository into a local output directory, then save
    # the pipeline there and push the result back to the Hub.
    target_dir = os.path.join(sys.path[0], 'out', 'pipeline')
    repo = Repository(
        target_dir,
        clone_from=f"arubenruben/PT-BERT-Large-CRF-HAREM-Default",
        use_auth_token=True,
    )
    classifier.save_pretrained(target_dir)
    repo.push_to_hub()