dsilin
/

detok-deberta-xl

This model can be used to detokenize Moses-tokenized text more accurately (it does a better job with certain lossy quotes and similar cases).

Batched usage:
```python
# Example Moses-style tokenized inputs for the batched usage demo.
sentences = [
    "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,",
    "\" We 'll talk go by now ; \" says Shucksmith ;",
    "He 'll enjoy it more now that this he be dead , if put 'll pardon the expression .",
    "I think you 'll be amazed at this way it finds ,",
    "Michigan voters ^ are so frightened of fallen in permanent economic collapse that they 'll grab onto anything .",
    "You 'll finding outs episode 4 .",
    "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said .",
    "You can look at the things I 'm saying about my record and about the events of campaign and history and you 'll find if now and and then I miss a words or I get something slightly off , I 'll correct it , acknowledge where it are wrong .",
    "Wonder if 'll alive to see .",
    "We 'll have to combine and a numbered of people .",
]
def sentences_to_input_tokens(sentences):
    """Tokenize a batch of sentences and pad it into model-ready tensors.

    Uses the ``tokenizer`` and ``device`` objects that must already be
    defined in the enclosing scope.

    Args:
        sentences: Batch of sentences, passed straight to ``tokenizer``.

    Returns:
        A ``(encoding, sents_tokens)`` tuple. ``encoding`` is a dict with
        ``input_ids``, ``attention_mask`` and ``position_ids`` tensors, each
        padded to the longest sequence in the batch and moved to ``device``.
        ``sents_tokens`` is the list of unpadded token-id lists, one per
        sentence.
    """
    sents_tokens = list(tokenizer(sentences)['input_ids'])
    max_length = max((len(ids) for ids in sents_tokens), default=0)

    input_ids = []
    attention_masks = []
    position_ids = []
    for ids in sents_tokens:
        pad_len = max_length - len(ids)
        # Real tokens get mask 1 and positions 0..len-1; padded slots get the
        # pad token id, mask 0 and position 0.
        input_ids.append(ids + [tokenizer.pad_token_id] * pad_len)
        attention_masks.append([1] * len(ids) + [0] * pad_len)
        position_ids.append(list(range(len(ids))) + [0] * pad_len)

    encoding = {
        "input_ids": torch.tensor(input_ids).to(device),
        "attention_mask": torch.tensor(attention_masks).to(device),
        "position_ids": torch.tensor(position_ids).to(device),
    }
    return encoding, sents_tokens
def run_token_predictor_sentences(sentences):
    """Detokenize a batch of sentences using the model's per-token scores.

    Uses the ``model`` and ``tokenizer`` objects that must already be defined
    in the enclosing scope, plus ``sentences_to_input_tokens`` for batching.

    Args:
        sentences: Batch of tokenized sentences to detokenize.

    Returns:
        A list with one detokenized string per input sentence. An empty
        batch yields an empty list without invoking the model.
    """
    if not sentences:
        return []

    encoding, sents_tokens = sentences_to_input_tokens(sentences)
    # Inference only: no need to record the autograd graph.
    with torch.no_grad():
        # First element of the model output holds the per-token score pairs
        # (presumably the classification logits -- confirm against the model).
        predictions = model(**encoding)[0].cpu().tolist()

    outstrs = []
    for token_ids, scores in zip(sents_tokens, predictions):
        # Drop the first and last positions of both sequences; zip stops at
        # the shorter one, so scores for padded positions are never read.
        tokens = tokenizer.convert_ids_to_tokens(token_ids[1:-1])
        pieces = []
        for token, score in zip(tokens, scores[1:-1]):
            if "▁" not in token:
                # No word-boundary marker: glue the piece on as-is.
                pieces.append(token)
            elif score[0] > score[1]:
                # First score wins: the boundary marker becomes a space.
                pieces.append(token.replace("▁", " "))
            else:
                # Second score wins: the boundary marker is removed.
                pieces.append(token.replace("▁", ""))
        outstrs.append("".join(pieces).strip())
    return outstrs
# Run the example batch and print each input sentence followed by its
# detokenized output.
outs = run_token_predictor_sentences(sentences)
for detokenized, original in zip(outs, sentences):
    print(original)
    print(detokenized)
    print('\n------\n')
```