---
language: en
widget:
- text: "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,"
- text: "\" We 'll talk go by now ; \" says Shucksmith ;"
- text: "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said ."
---

This model can be used to detokenize output of the Moses tokenizer more accurately than the standard rule-based detokenizer (it does a better job with certain lossy cases, such as ambiguous straight quotes).
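
For comparison, the rule-based baseline this model aims to improve on looks like the following (a minimal sketch, assuming the `sacremoses` package is installed; it is not part of this model):

```python
# Rule-based baseline for comparison (assumes `pip install sacremoses`).
from sacremoses import MosesDetokenizer

md = MosesDetokenizer(lang="en")
tokens = "\" We 'll talk go by now ; \" says Shucksmith ;".split()
print(md.detokenize(tokens))
# A rule-based detokenizer has to guess the direction of each ambiguous
# straight quote; this model learns those attachment decisions from data.
```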

Batched usage:
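
The snippet below assumes that `tokenizer`, `model`, and `device` have already been set up. A minimal setup sketch, assuming a sentencepiece tokenizer and a two-label token-classification head (the repo id is a placeholder for this model's actual id):

```python
# Minimal setup sketch. "<this-model-repo-id>" is a placeholder, and
# AutoModelForTokenClassification is an assumption based on the two logits
# per token consumed by the decoding loop below.
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("<this-model-repo-id>")
model = AutoModelForTokenClassification.from_pretrained("<this-model-repo-id>").to(device)
model.eval()
```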

```python
sentences = [
    "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,",
    "\" We 'll talk go by now ; \" says Shucksmith ;",
    "He 'll enjoy it more now that this he be dead , if put 'll pardon the expression .",
    "I think you 'll be amazed at this way it finds ,",
    "Michigan voters ^ are so frightened of fallen in permanent economic collapse that they 'll grab onto anything .",
    "You 'll finding outs episode 4 .",
    "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said .",
    "You can look at the things I 'm saying about my record and about the events of campaign and history and you 'll find if now and and then I miss a words or I get something slightly off , I 'll correct it , acknowledge where it are wrong .",
    "Wonder if 'll alive to see .",
    "We 'll have to combine and a numbered of people ."
]

def sentences_to_input_tokens(sentences):
    # Tokenize each sentence, then pad everything to the length of the
    # longest sequence so the batch can be run in one forward pass.
    all_tokens = []
    max_length = 0
    sents_tokens = []
    iids = tokenizer(sentences)
    for sent_tokens in iids['input_ids']:
        sents_tokens.append(sent_tokens)

        if len(sent_tokens) > max_length:
            max_length = len(sent_tokens)

        attention_mask = [1] * len(sent_tokens)
        pos_ids = list(range(len(sent_tokens)))

        all_tokens.append({
            "iids": sent_tokens,
            "am": attention_mask,
            "pos": pos_ids
        })

    input_ids = []
    attention_masks = []
    position_ids = []
    for encoding in all_tokens:
        # Right-pad each sequence; padded positions are zeroed out by the
        # attention mask, so their position ids can safely stay 0.
        pad_len = max_length - len(encoding['iids'])
        attention_masks.append(encoding['am'] + [0] * pad_len)
        position_ids.append(encoding['pos'] + [0] * pad_len)
        input_ids.append(encoding['iids'] + [tokenizer.pad_token_id] * pad_len)

    encoding = {
        "input_ids": torch.tensor(input_ids).to(device),
        "attention_mask": torch.tensor(attention_masks).to(device),
        "position_ids": torch.tensor(position_ids).to(device)
    }

    return encoding, sents_tokens

def run_token_predictor_sentences(sentences):
    encoding, sents_tokens = sentences_to_input_tokens(sentences)
    with torch.no_grad():
        predictions = model(**encoding)[0].cpu().tolist()
    outstrs = []

    for i in range(len(predictions)):
        # Drop the special tokens added at either end, then decide for every
        # sentencepiece token that starts with "▁" whether that marker should
        # become a space or be dropped.
        outstr = ""
        for token, logits in zip(tokenizer.convert_ids_to_tokens(sents_tokens[i][1:-1]), predictions[i][1:-1]):
            if "▁" not in token:
                outstr += token
            elif logits[0] > logits[1]:
                outstr += token.replace("▁", " ")
            else:
                outstr += token.replace("▁", "")
        outstrs.append(outstr.strip())
    return outstrs

outs = run_token_predictor_sentences(sentences)
for detokenized, original in zip(outs, sentences):
    print(original)
    print(detokenized)
    print('\n------\n')
```
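
As the decoding loop above shows, each sentencepiece token that starts with the `▁` marker gets two logits: the first votes for keeping the marker as a space, the second for dropping it so the piece attaches to the previous token (this is how clitics like `'ll` and punctuation get re-attached). The same function works for a single sentence by passing a one-element list:

```python
# Detokenize a single Moses-tokenized sentence.
print(run_token_predictor_sentences(["They 're a young team ."])[0])
```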