dsilin
/

detok-deberta-xl

@@ -1,95 +1,3 @@
 ---
-language: english
-widget:
-- text: "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,"
-- text: "\" We 'll talk go by now ; \" says Shucksmith ;"
-- text: "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said ."
 ---
-This model can be used to more accurately detokenize the moses tokenizer (it does a better job with certain lossy quotes and things)
-batched usage:
-```python
-sentences = [
-    "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,",
-    "\" We 'll talk go by now ; \" says Shucksmith ;",
-    "He 'll enjoy it more now that this he be dead , if put 'll pardon the expression .",
-    "I think you 'll be amazed at this way it finds ,",
-    "Michigan voters ^ are so frightened of fallen in permanent economic collapse that they 'll grab onto anything .",
-    "You 'll finding outs episode 4 .",
-    "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said .",
-    "You can look at the things I 'm saying about my record and about the events of campaign and history and you 'll find if now and and then I miss a words or I get something slightly off , I 'll correct it , acknowledge where it are wrong .",
-    "Wonder if 'll alive to see .",
-    "We 'll have to combine and a numbered of people ."
-]
-def sentences_to_input_tokens(sentences):
-    all_tokens = []
-    max_length = 0
-    sents_tokens = []
-    iids = tokenizer(sentences)
-    for sent_tokens in iids['input_ids']:
-        sents_tokens.append(sent_tokens)
-        if len(sent_tokens) > max_length:
-            max_length = len(sent_tokens)
-        attention_mask = [1] * len(sent_tokens)
-        pos_ids = list(range(len(sent_tokens)))
-        encoding = {
-            "iids": sent_tokens,
-            "am": attention_mask,
-            "pos": pos_ids
-        }
-        all_tokens.append(encoding)
-    input_ids = []
-    attention_masks = []
-    position_ids = []
-    for i in range(len(all_tokens)):
-        encoding = all_tokens[i]
-        pad_len = max_length - len(encoding['iids'])
-        attention_masks.append(encoding['am'] + [0] * pad_len)
-        position_ids.append(encoding['pos'] + [0] * pad_len)
-        input_ids.append(encoding['iids'] + [tokenizer.pad_token_id] * pad_len)
-    encoding = {
-        "input_ids": torch.tensor(input_ids).to(device),
-        "attention_mask": torch.tensor(attention_masks).to(device),
-        "position_ids": torch.tensor(position_ids).to(device)
-    }
-    return encoding, sents_tokens
-def run_token_predictor_sentences(sentences):
-    encoding, at = sentences_to_input_tokens(sentences)
-    predictions = model(**encoding)[0].cpu().tolist()
-    outstrs = []
-    for i in range(len(predictions)):
-        outstr = ""
-        for p in zip(tokenizer.convert_ids_to_tokens(at[i][1:-1]), predictions[i][1:-1]):
-            if not "▁" in p[0]:
-                outstr+=p[0]
-            else:
-                if p[1][0] > p[1][1]:
-                    outstr+=p[0].replace("▁", " ")
-                else:
-                    outstr+=p[0].replace("▁", "")
-        outstrs.append(outstr.strip())
-    return outstrs
-outs = run_token_predictor_sentences(sentences)
-for p in zip(outs, sentences):
-    print(p[1])
-    print(p[0])
-    print('\n------\n')
-```

 ---
+language: en
 ---