|
class TokenizeAndAlignLabelsStep():
    """Pipeline step that tokenizes input text and builds a boolean mask
    marking the first sub-token of every word (the label-carrying positions
    for token-classification / CRF decoding)."""

    def tokenize_and_align_labels(self, examples, tokenizer):
        """Tokenize `examples` and record which token positions start a word.

        Args:
            examples: passed straight through to `tokenizer` — presumably a
                pre-tokenized sequence of words; verify against the caller.
            tokenizer: a Hugging Face *fast* tokenizer (must expose
                `word_ids()` on its output).

        Returns:
            The tokenizer output (padded/truncated to 128 tokens), extended
            with two extra entries:
                "tokens": the input id of the first sub-token of each word.
                "labels_mask": per-token booleans — True only at a word's
                    first sub-token; False for special tokens, padding, and
                    continuation sub-tokens.
        """
        tokenized_inputs = tokenizer(examples, padding='max_length', truncation=True, max_length=128)

        # word_ids() maps each token position to the index of the word it
        # came from (None for special tokens such as [CLS]/[SEP]/padding).
        word_ids = tokenized_inputs.word_ids()

        previous_word_idx = None
        tokens = []
        labels_mask = []

        # BUG FIX: the original indexed `input_ids` with `word_idx` (a *word*
        # index), not the token's position in the sequence, so the collected
        # ids were not the words' first sub-tokens. Enumerate to get the
        # token position and index with that instead.
        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is None:
                # Special token or padding: never a labeled position.
                labels_mask.append(False)
            elif word_idx != previous_word_idx:
                # First sub-token of a new word: labeled position.
                labels_mask.append(True)
                tokens.append(tokenized_inputs["input_ids"][token_idx])
            else:
                # Continuation sub-token of the same word: not labeled.
                labels_mask.append(False)

            previous_word_idx = word_idx

        tokenized_inputs["tokens"] = tokens
        tokenized_inputs["labels_mask"] = labels_mask

        return tokenized_inputs
|
def main():
    """Register the custom CRF pipeline, instantiate it, and publish the
    saved pipeline to the Hugging Face Hub."""
    # Make the custom pipeline type resolvable by name before building it.
    PIPELINE_REGISTRY.register_pipeline(
        "PT-BERT-Large-CRF-HAREM-Default-pipeline",
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )

    # Prefer the GPU when one is available, otherwise fall back to CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    classifier = pipeline(
        "PT-BERT-Large-CRF-HAREM-Default-pipeline",
        model="arubenruben/PT-BERT-Large-CRF-HAREM-Default",
        device=device,
        trust_remote_code=True,
    )

    # Clone the model repository into a local output directory, then save
    # the pipeline there and push the result back to the Hub.
    target_dir = os.path.join(sys.path[0], 'out', 'pipeline')
    repo = Repository(
        target_dir,
        clone_from=f"arubenruben/PT-BERT-Large-CRF-HAREM-Default",
        use_auth_token=True,
    )
    classifier.save_pretrained(target_dir)
    repo.push_to_hub()