File size: 4,069 Bytes
7120e17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
776fd18
 
7120e17
 
 
 
 
776fd18
7120e17
 
 
 
 
 
 
2249e1f
7120e17
 
 
 
 
 
 
 
 
 
2249e1f
7120e17
 
 
 
 
776fd18
 
7120e17
 
 
776fd18
 
7120e17
 
 
 
 
9112af4
7120e17
 
9112af4
7120e17
 
9112af4
7120e17
 
9112af4
7120e17
 
 
 
 
 
 
 
776fd18
7120e17
 
 
776fd18
7120e17
 
 
776fd18
7120e17
 
9112af4
7120e17
 
 
9112af4
 
7120e17
 
 
 
 
 
 
9112af4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import torch
from transformers import Pipeline
from transformers import AutoTokenizer
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline
from transformers import AutoModelForTokenClassification
from huggingface_hub import Repository
import sys
import os


class TokenizeAndAlignLabelsStep():
    """Tokenize pre-split words and flag the first sub-token of each word."""

    # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
    def tokenize_and_align_labels(self, examples, tokenizer):
        """Tokenize ``examples`` and attach a ``labels_mask`` entry to the result.

        Args:
            examples: Words handed to ``tokenizer`` with
                ``is_split_into_words=True``.
            tokenizer: Callable tokenizer; its result must expose
                ``word_ids()`` and support item assignment.

        Returns:
            The tokenizer result with an extra ``"labels_mask"`` key: a list
            holding one boolean per entry of ``word_ids()``. A position is
            ``True`` only when its word id is not ``None`` and differs from
            the word id of the position right before it (i.e. the first
            token of a word); every other position is ``False``.
        """
        encoding = tokenizer(
            examples,
            padding='max_length',
            truncation=True,
            max_length=128,
            is_split_into_words=True,
        )

        # Word id of every token position (None where there is no word).
        word_ids = list(encoding.word_ids())
        # Same sequence shifted right by one, so each position can be
        # compared against its predecessor; the first one compares to None.
        preceding_ids = [None] + word_ids[:-1]

        encoding["labels_mask"] = [
            current is not None and current != previous
            for previous, current in zip(preceding_ids, word_ids)
        ]

        return encoding



class BERT_CRF_Pipeline(Pipeline):
    """Custom pipeline that runs a BERT-CRF token classifier on pre-split tokens.

    The input is a mapping with a ``'tokens'`` entry. The output is the first
    element of the model output with every tag id replaced by its name from
    ``model.config.id2label``.
    """

    # Checkpoint whose tokenizer is used during preprocessing.
    TOKENIZER_CHECKPOINT = "neuralmind/bert-base-portuguese-cased"

    def _sanitize_parameters(self, **kwargs):
        """Return empty preprocess/forward/postprocess parameter dicts.

        No call-time keyword argument is forwarded to any stage.
        """
        return {}, {}, {}

    def _bert_crf_tokenizer(self):
        """Return the preprocessing tokenizer, loading it on first use only.

        ``from_pretrained`` is expensive, so the instance is cached on the
        pipeline instead of being rebuilt for every input.
        """
        tokenizer = getattr(self, "_bert_crf_tokenizer_cache", None)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                self.TOKENIZER_CHECKPOINT, do_lower_case=False)
            self._bert_crf_tokenizer_cache = tokenizer
        return tokenizer

    def preprocess(self, inputs):
        """Tokenize ``inputs['tokens']`` and compute the first-sub-token mask.

        Args:
            inputs: Mapping with a ``'tokens'`` entry (the pre-split words).

        Returns:
            The tokenizer result extended with ``"labels_mask"``, as produced
            by ``TokenizeAndAlignLabelsStep.tokenize_and_align_labels``.
        """
        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
            examples=inputs['tokens'], tokenizer=self._bert_crf_tokenizer())

    def _forward(self, tokenizer_results):
        """Run the model on a single tokenized example.

        Args:
            tokenizer_results: Output of ``preprocess``; must provide
                ``input_ids``, ``token_type_ids``, ``attention_mask`` and
                ``labels_mask``.

        Returns:
            Whatever ``self.model`` returns when called with ``labels=None``.
        """
        # Build the tensors on the device this pipeline was created with, so
        # they always live next to the model rather than on whichever device
        # happens to be available globally.
        device = self.device

        def as_batch(key, dtype):
            # unsqueeze(0) adds the leading batch dimension (batch of one).
            return torch.tensor(
                tokenizer_results[key], dtype=dtype, device=device).unsqueeze(0)

        # input_ids, token_type_ids, attention_mask, labels, labels_mask
        outputs = self.model(
            input_ids=as_batch('input_ids', torch.long),
            token_type_ids=as_batch('token_type_ids', torch.long),
            attention_mask=as_batch('attention_mask', torch.bool),
            labels=None,
            labels_mask=as_batch('labels_mask', torch.bool),
        )

        return outputs

    def postprocess(self, model_outputs):
        """Translate the predicted tag ids of the first output into label names.

        Args:
            model_outputs: Model output whose first element is an iterable of
                tag ids that are keys of ``model.config.id2label``.

        Returns:
            A new list with the label name of every tag id; the model output
            itself is left unmodified.
        """
        id2label = self.model.config.id2label
        return [id2label[tag] for tag in model_outputs[0]]



def main():
    """Register the custom pipeline, instantiate it and push it to the Hub.

    The pipeline is saved into ``<sys.path[0]>/out/pipeline``, which is a
    local clone of the model repository, and the clone is then pushed.
    """
    task_name = "PT-BERT-Large-CRF-HAREM-Selective-pipeline"
    repo_id = "arubenruben/PT-BERT-Large-CRF-HAREM-Selective"

    PIPELINE_REGISTRY.register_pipeline(
        task_name,
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )

    # Prefer the GPU when one is available.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    classifier = pipeline(
        task_name,
        model=repo_id,
        device=device,
        trust_remote_code=True,
    )

    out_path = os.path.join(sys.path[0], 'out', 'pipeline')
    repo = Repository(out_path, clone_from=repo_id, use_auth_token=True)

    # NOTE: pulling before saving (`repo.git_pull()`) is currently disabled.

    classifier.save_pretrained(out_path)
    repo.push_to_hub()