This model can be used to detokenize the output of the Moses tokenizer more accurately than the rule-based detokenizer (it does a better job with certain lossy cases, such as ambiguous quotes).
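
The snippets below assume `tokenizer`, `model`, and `device` are already set up along these lines (a minimal sketch; the model ID is a placeholder for this repository's ID):

```python
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_id = "<this-model-id>"  # placeholder: substitute this repository's model ID

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()
```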

Batched usage:

```python
sentences = [
    "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,",
    "\" We 'll talk go by now ; \" says Shucksmith ;",
    "He 'll enjoy it more now that this he be dead , if put 'll pardon the expression .",
    "I think you 'll be amazed at this way it finds ,",
    "Michigan voters ^ are so frightened of fallen in permanent economic collapse that they 'll grab onto anything .",
    "You 'll finding outs episode 4 .",
    "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said .",
    "You can look at the things I 'm saying about my record and about the events of campaign and history and you 'll find if now and and then I miss a words or I get something slightly off , I 'll correct it , acknowledge where it are wrong .",
    "Wonder if 'll alive to see .",
    "We 'll have to combine and a numbered of people ."
]
```

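For comparison, the rule-based Moses detokenizer can be run over the same token streams. This sketch uses the `sacremoses` package, which is an assumption here (any Moses-compatible detokenizer would do):

```python
from sacremoses import MosesDetokenizer

md = MosesDetokenizer(lang="en")
# Rule-based baseline: detokenize one example sentence for
# comparison with the model's output below.
print(md.detokenize(sentences[1].split()))
```
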
```python
def sentences_to_input_tokens(sentences):
    all_tokens = []
    max_length = 0
    sents_tokens = []
    iids = tokenizer(sentences)
    for sent_tokens in iids['input_ids']:
        sents_tokens.append(sent_tokens)
        if len(sent_tokens) > max_length:
            max_length = len(sent_tokens)
        attention_mask = [1] * len(sent_tokens)
        pos_ids = list(range(len(sent_tokens)))

        encoding = {
            "iids": sent_tokens,
            "am": attention_mask,
            "pos": pos_ids
        }
        all_tokens.append(encoding)

    # Pad every sentence to the length of the longest one in the batch.
    input_ids = []
    attention_masks = []
    position_ids = []
    for i in range(len(all_tokens)):
        encoding = all_tokens[i]
        pad_len = max_length - len(encoding['iids'])
        attention_masks.append(encoding['am'] + [0] * pad_len)
        position_ids.append(encoding['pos'] + [0] * pad_len)
        input_ids.append(encoding['iids'] + [tokenizer.pad_token_id] * pad_len)

    encoding = {
        "input_ids": torch.tensor(input_ids).to(device),
        "attention_mask": torch.tensor(attention_masks).to(device),
        "position_ids": torch.tensor(position_ids).to(device)
    }
    return encoding, sents_tokens
```
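
Every tensor in the returned encoding is padded to the same length, so a quick shape check looks like this:

```python
encoding, sents_tokens = sentences_to_input_tokens(sentences)
# Both tensors are (batch_size, max_length); here batch_size == 10.
print(encoding["input_ids"].shape)
print(encoding["attention_mask"].shape)
```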

```python
def run_token_predictor_sentences(sentences):
    encoding, at = sentences_to_input_tokens(sentences)
    with torch.no_grad():
        predictions = model(**encoding)[0].cpu().tolist()
    outstrs = []

    for i in range(len(predictions)):
        outstr = ""
        # Skip the special tokens at both ends ([1:-1]) and walk each
        # sentencepiece token alongside its two logits.
        for p in zip(tokenizer.convert_ids_to_tokens(at[i][1:-1]), predictions[i][1:-1]):
            if not "▁" in p[0]:
                # Word-internal piece: glue it straight onto the output.
                outstr += p[0]
            else:
                # Word-initial piece: the classifier decides whether the
                # "▁" marker becomes a space or is dropped.
                if p[1][0] > p[1][1]:
                    outstr += p[0].replace("▁", " ")
                else:
                    outstr += p[0].replace("▁", "")
        outstrs.append(outstr.strip())
    return outstrs
```
Running the predictor over the example sentences:

```python
outs = run_token_predictor_sentences(sentences)
for p in zip(outs, sentences):
    # Print each detokenized string next to its Moses-tokenized input.
    print(p[1])
    print(p[0])
    print()
```