mlwong's picture
Add logger
d718096
raw
history blame contribute delete
No virus
3.29 kB
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline as hf_pipeline
from pathlib import Path
from typing import Any, Dict
from .app_logger import get_logger
class NpcBertCLS:
    r"""A class for performing report classification with BERT.

    This class facilitates report classification tasks using a BERT model
    fine-tuned on NPC staging reports. The base model is an uncased model
    released by Microsoft, available on the Hugging Face model hub as
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'.

    Attributes:
        model (transformers.PreTrainedModel):
            The fine-tuned BERT model for sequence classification.
        tokenizer (transformers.PreTrainedTokenizer):
            The tokenizer for the BERT model.
        pipeline (transformers.Pipeline):
            The Hugging Face text-classification pipeline.
        pretrained_model (str):
            The path to the directory containing the fine-tuned model.
    """
    # Shared class-level logger; all instances log through the same handler.
    logger = get_logger()

    # Minimum number of characters required before classification is attempted.
    MIN_TEXT_LENGTH = 10

    def __init__(self) -> None:
        self.model = None
        self.tokenizer = None
        self.pipeline = None
        # Path is relative to app.py (the application entry point), not this module.
        self.pretrained_model = "./models/npc-bert-cls"
        self.logger.info(f"Created {__class__.__name__} instance.")

    def load(self) -> None:
        """Loads the fine-tuned BERT model and related components.

        Initializes the model, tokenizer, and text-classification pipeline
        using the pre-trained weights from the directory given by
        ``self.pretrained_model``. Must be called before :meth:`__call__`.

        Raises:
            FileNotFoundError: If the pretrained model directory is not found.
        """
        if not Path(self.pretrained_model).is_dir():
            # NOTE: grammar of the original message fixed ("Cannot found" -> "Cannot find").
            raise FileNotFoundError(f"Cannot find pretrained model at: {self.pretrained_model}")

        self.model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_model)
        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model)
        # device='cpu' requires a transformers version that accepts string devices
        # (older releases expect an int, -1 for CPU) -- TODO confirm pinned version.
        self.pipeline = hf_pipeline(
            "text-classification",
            model=self.model,
            tokenizer=self.tokenizer,
            device='cpu',
        )

    def __call__(self, *args: Any) -> Any:
        """Performs classification on the given reports.

        Must be called only after :meth:`load` has been executed so that the
        pipeline is initialized. Arguments are forwarded verbatim to the
        Hugging Face text-classification pipeline.

        Args:
            *args: Variable length argument list passed to the pipeline; the
                first argument is expected to be the report text (or texts).

        Returns:
            Dict[str, float]: Mapping of class label to confidence score, or a
            plain string message when the input text is too short to classify.

        Raises:
            BrokenPipeError: If the model has not been loaded before calling
                this method.
        """
        self.logger.info(f"Called with {args = }")
        if self.pipeline is None:
            msg = "Model was not initialized, have you run load()?"
            raise BrokenPipeError(msg)

        # Guard against a zero-argument call (the original code raised an
        # uncaught IndexError on args[0]) and against inputs too short to
        # yield a meaningful classification.
        if not args or len(args[0]) < self.MIN_TEXT_LENGTH:
            return "Not enough text for classification!"

        pipe_out = self.pipeline(*args)
        # Flatten the pipeline's list of {'label': ..., 'score': ...} dicts
        # into a single {label: score} mapping.
        return {o['label']: o['score'] for o in pipe_out}