File size: 3,001 Bytes
08232dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline as hf_pipeline
from pathlib import Path

class NpcBertMLM():
    r"""A class for performing masked language modeling with BERT.

    This class provides functionality to perform masked language modeling
    predictions using a BERT model fine-tuned on NPC staging reports. The
    base model used is an uncased model released by Microsoft, and it can be
    found on the Hugging Face model hub under the name
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'.

    Attributes:
        model (transformers.PreTrainedModel):
            The fine-tuned BERT model. ``None`` until :meth:`load` is called.
        tokenizer (transformers.PreTrainedTokenizer):
            The tokenizer for the BERT model. ``None`` until :meth:`load`.
        pipeline (transformers.Pipeline):
            The Hugging Face "fill-mask" pipeline. ``None`` until :meth:`load`.
        pretrained_model (str):
            Path to the directory containing the fine-tuned model weights.
    """
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.pipeline = None
        # Relative to app.py (i.e. the process working directory).
        self.pretrained_model = "./models/npc-bert-best"

    def load(self):
        """Loads the fine-tuned BERT model and related components.

        This method initializes the model, tokenizer, and pipeline for the
        masked language modeling tasks using the pre-trained weights from the
        specified directory.

        Raises:
            FileNotFoundError: If the pretrained model directory is not found.
        """
        if not Path(self.pretrained_model).is_dir():
            raise FileNotFoundError(f"Cannot find pretrained model at: {self.pretrained_model}")

        self.model = AutoModelForMaskedLM.from_pretrained(self.pretrained_model)
        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model)
        # NOTE(review): passing device as the string 'cpu' requires a
        # reasonably recent transformers release (older versions expect an
        # int, -1 for CPU) — confirm against the pinned dependency version.
        self.pipeline = hf_pipeline("fill-mask", model=self.model, tokenizer=self.tokenizer, device='cpu')

    def __call__(self, *args):
        """Performs masked language modeling prediction.

        This method should be called only after the `load` method has been
        executed so that the pipeline is initialized. Arguments are forwarded
        verbatim to the Hugging Face fill-mask pipeline.

        Args:
            *args: Variable length argument list to pass to the pipeline.

        Returns:
            dict: Mapping of candidate token string -> score for the first
            masked position. Empty dict if the pipeline returned no results.

        Raises:
            BrokenPipeError: If the model has not been loaded before calling
                this method.
        """
        if self.pipeline is None:
            msg = "Model was not initialized, have you run load()?"
            raise BrokenPipeError(msg)
        pipe_out = self.pipeline(*args)
        # Guard the degenerate empty result rather than crashing on [0] below.
        if not pipe_out:
            return {}
        # With multiple inputs/masks the pipeline returns a list of lists;
        # keep only the predictions for the first masked position.
        if not isinstance(pipe_out[0], dict):
            pipe_out = pipe_out[0]

        return {oo['token_str']: oo['score'] for oo in pipe_out}