File size: 3,726 Bytes
40bc550 30ff282 40bc550 30ff282 40bc550 30ff282 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
---
language: english
widget:
- text: "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,"
- text: "\" We 'll talk go by now ; \" says Shucksmith ;"
- text: "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said ."
---
This model can be used to more accurately detokenize the moses tokenizer (it does a better job with certain lossy quotes and things)
batched usage:
```python
sentences = [
"They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,",
"\" We 'll talk go by now ; \" says Shucksmith ;",
"He 'll enjoy it more now that this he be dead , if put 'll pardon the expression .",
"I think you 'll be amazed at this way it finds ,",
"Michigan voters ^ are so frightened of fallen in permanent economic collapse that they 'll grab onto anything .",
"You 'll finding outs episode 4 .",
"\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said .",
"You can look at the things I 'm saying about my record and about the events of campaign and history and you 'll find if now and and then I miss a words or I get something slightly off , I 'll correct it , acknowledge where it are wrong .",
"Wonder if 'll alive to see .",
"We 'll have to combine and a numbered of people ."
]
def sentences_to_input_tokens(sentences):
all_tokens = []
max_length = 0
sents_tokens = []
iids = tokenizer(sentences)
for sent_tokens in iids['input_ids']:
sents_tokens.append(sent_tokens)
if len(sent_tokens) > max_length:
max_length = len(sent_tokens)
attention_mask = [1] * len(sent_tokens)
pos_ids = list(range(len(sent_tokens)))
encoding = {
"iids": sent_tokens,
"am": attention_mask,
"pos": pos_ids
}
all_tokens.append(encoding)
input_ids = []
attention_masks = []
position_ids = []
for i in range(len(all_tokens)):
encoding = all_tokens[i]
pad_len = max_length - len(encoding['iids'])
attention_masks.append(encoding['am'] + [0] * pad_len)
position_ids.append(encoding['pos'] + [0] * pad_len)
input_ids.append(encoding['iids'] + [tokenizer.pad_token_id] * pad_len)
encoding = {
"input_ids": torch.tensor(input_ids).to(device),
"attention_mask": torch.tensor(attention_masks).to(device),
"position_ids": torch.tensor(position_ids).to(device)
}
return encoding, sents_tokens
def run_token_predictor_sentences(sentences):
encoding, at = sentences_to_input_tokens(sentences)
predictions = model(**encoding)[0].cpu().tolist()
outstrs = []
for i in range(len(predictions)):
outstr = ""
for p in zip(tokenizer.convert_ids_to_tokens(at[i][1:-1]), predictions[i][1:-1]):
if not "▁" in p[0]:
outstr+=p[0]
else:
if p[1][0] > p[1][1]:
outstr+=p[0].replace("▁", " ")
else:
outstr+=p[0].replace("▁", "")
outstrs.append(outstr.strip())
return outstrs
outs = run_token_predictor_sentences(sentences)
for p in zip(outs, sentences):
print(p[1])
print(p[0])
print('\n------\n')
``` |