Spaces:
Running
Running
from transformers import AutoTokenizer | |
import re | |
import string | |
class TF_Tokenizer:
    """Callable wrapper around a tokenizer loaded with ``AutoTokenizer``."""

    def __init__(self, model_str):
        """Load the tokenizer identified by *model_str*.

        *model_str* is passed unchanged to ``AutoTokenizer.from_pretrained``.
        """
        # Must live on the instance: __call__ reads self.tok.
        self.tok = AutoTokenizer.from_pretrained(model_str)

    def __call__(self, txt):
        """Return ``self.tok.tokenize(txt)``."""
        return self.tok.tokenize(txt)
class WS_Tokenizer:
    """Regex tokenizer that needs no model.

    Each ASCII punctuation character becomes its own token and each run of
    word characters (``\\w+``) becomes one token; whitespace is discarded.
    """

    # re.escape keeps every punctuation character literal inside the class.
    # Unescaped, the backslash in string.punctuation merely escapes the "]"
    # that follows it, so backslashes in the text would be dropped silently.
    _TOKEN_RE = re.compile(r"[{}]|\w+".format(re.escape(string.punctuation)))

    def __init__(self):
        pass

    def __call__(self, txt):
        """Return the list of tokens found in *txt*, in order."""
        return self._TOKEN_RE.findall(txt)
def convert_spans_to_bio(txt, roles, tokenizer_func):
    """Project character-offset spans onto tokens as BIO tags.

    Args:
        txt: The text the spans refer to.
        roles: Iterable of dicts with ``"start"`` and ``"end"`` character
            offsets into *txt* and a ``"label"`` string.
        tokenizer_func: Callable mapping *txt* to a list of token strings.

    Returns:
        A list with one tag per token: ``"B-<label>"`` for a token that
        begins exactly at a span start, ``"I-<label>"`` for following tokens
        that begin before that span's end, and ``"O"`` otherwise. Tokens that
        cannot be aligned because the text is exhausted are tagged ``"O"``.
    """
    tokens = tokenizer_func(txt)

    # Start offset -> span. The sort is stable and setdefault keeps the first
    # entry, so the earliest-listed span wins when several share a start.
    span_at = {}
    for role in sorted(roles, key=lambda r: r["start"]):
        span_at.setdefault(role["start"], role)

    text_len = len(txt)
    pos = 0  # character offset of the current token in txt
    span_end = -1  # end offset of the most recently opened span
    inside_tag = "O"  # "I-<label>" of the most recently opened span
    tags = []
    for tok in tokens:
        # Skip any whitespace (not only " ") and never index past the text.
        while pos < text_len and txt[pos].isspace():
            pos += 1
        if pos >= text_len:
            break

        role = span_at.get(pos)
        if role is not None:  # Start of a new role
            span_end = role["end"]
            inside_tag = "I-" + role["label"]
            tags.append("B-" + role["label"])
        elif pos < span_end:  # Still inside the previous role
            tags.append(inside_tag)
        else:
            tags.append("O")

        # Advance by the characters the token covers in txt, using the same
        # widths as convert_bio_to_spans: a "##" continuation piece counts
        # without its prefix and "[UNK]" counts as one character.
        if tok.startswith("##"):
            pos += len(tok) - 2
        elif tok == "[UNK]":
            pos += 1
        else:
            pos += len(tok)

    # Tokens left over after the text ran out get the outside tag.
    tags.extend(["O"] * (len(tokens) - len(tags)))
    return tags
def convert_bio_to_spans(txt, troles, tokenizer_func):
    """Convert per-token BIO tags back into character-offset spans.

    Args:
        txt: The text that was tagged.
        troles: Sequence of tags (``"B-<label>"``, ``"I-<label>"``, ``"O"``),
            one per token; surplus tokens or tags are ignored.
        tokenizer_func: Callable mapping *txt* to a list of token strings.

    Returns:
        A list of ``{"start", "end", "label"}`` dicts in text order. A span
        opens at a ``B-`` tag and is closed by the next ``B-`` or ``O`` tag
        or by the end of the tokens; any other tag leaves the open span
        running. Tags on ``"##"`` continuation pieces and on ``"[UNK]"``
        tokens are not interpreted.
    """
    tokens = tokenizer_func(txt)
    text_len = len(txt)

    spans = []
    pos = 0  # character offset of the current token in txt
    last_end = 0  # offset just past the most recently consumed token
    span_start = -1
    span_label = "O"
    span_open = False

    def _close_span():
        # Emit the open span, ending where the previous token ended.
        spans.append({"start": span_start, "end": last_end, "label": span_label})

    # zip stops at the shorter sequence, which covers both length mismatches.
    for tag, tok in zip(troles, tokens):
        while pos < text_len and txt[pos].isspace():
            pos += 1
        if pos >= text_len:
            # Nothing but whitespace was left; a span must not start past
            # the end of the text.
            break

        if tok.startswith("##"):
            # Continuation piece: covers its characters minus the prefix.
            pos += len(tok) - 2
        elif tok == "[UNK]":
            # Unknown token: width is not recoverable, counted as one char.
            pos += 1
        else:
            if tag.startswith("B-"):
                if span_open:
                    _close_span()
                span_start = pos
                span_label = tag[2:]
                span_open = True
            elif tag == "O" and span_open:
                _close_span()
                span_open = False
            pos += len(tok)
        # Track the end after every token so that a span also covers the
        # continuation pieces of its last word.
        last_end = pos

    if span_open:
        _close_span()
    return spans
def span2bio(txt, labels):
    """Tokenize *txt* with the punctuation/word regex and tag tokens in BIO.

    Args:
        txt: The text the spans refer to.
        labels: Iterable of dicts with ``"start"`` and ``"end"`` character
            offsets into *txt* and a ``"label"`` string.

    Returns:
        A ``(tokens, tags)`` tuple of equal-length lists. A token beginning
        exactly at a span start is tagged ``"B-<label>"``, following tokens
        that begin before that span's end ``"I-<label>"``, all others ``"O"``.
    """
    # re.escape keeps every punctuation character literal inside the class;
    # unescaped, the backslash only escapes the following "]" and backslashes
    # in the text would be dropped, shifting every later offset.
    token_pattern = r"[{}]|\w+".format(re.escape(string.punctuation))
    tokens = re.findall(token_pattern, txt)

    # Start offset -> span. Spans are ordered by label, so when several share
    # a start offset the one with the smallest label is used.
    span_at = {}
    for role in sorted(labels, key=lambda r: r["label"]):
        span_at.setdefault(role["start"], role)

    text_len = len(txt)
    pos = 0  # character offset of the current token in txt
    span_end = -1  # end offset of the most recently opened span
    inside_tag = "O"  # "I-<label>" of the most recently opened span
    tags = []
    for tok in tokens:
        # Skip any whitespace (not only " ") and never index past the text.
        while pos < text_len and txt[pos].isspace():
            pos += 1
        if pos >= text_len:
            break

        role = span_at.get(pos)
        if role is not None:  # Start of a new role
            span_end = role["end"]
            inside_tag = "I-" + role["label"]
            tags.append("B-" + role["label"])
        elif pos < span_end:  # Still inside the previous role
            tags.append(inside_tag)
        else:
            tags.append("O")
        pos += len(tok)

    # Tokens left over after the text ran out get the outside tag.
    tags.extend(["O"] * (len(tokens) - len(tags)))
    return tokens, tags