Upload README.md with huggingface_hub
Browse files
README.md
CHANGED
@@ -1,95 +1,3 @@
|
|
1 |
---
|
2 |
-
language:
|
3 |
-
widget:
|
4 |
-
- text: "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,"
|
5 |
-
- text: "\" We 'll talk go by now ; \" says Shucksmith ;"
|
6 |
-
- text: "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said ."
|
7 |
---
|
8 |
-
|
9 |
-
This model can be used to detokenize the output of the Moses tokenizer more accurately (it does a better job with certain lossy cases, such as quotes).
|
10 |
-
|
11 |
-
|
12 |
-
Batched usage:
|
13 |
-
|
14 |
-
```python
|
15 |
-
|
16 |
-
# Example Moses-tokenized inputs. The odd wording and spacing are part of the
# sample data and are kept exactly as-is.
sentences = [
    "They 're a young team . they have great players and amazing freshmen coming in , so think they 'll grow into themselves next year ,",
    "\" We 'll talk go by now ; \" says Shucksmith ;",
    "He 'll enjoy it more now that this he be dead , if put 'll pardon the expression .",
    "I think you 'll be amazed at this way it finds ,",
    "Michigan voters ^ are so frightened of fallen in permanent economic collapse that they 'll grab onto anything .",
    "You 'll finding outs episode 4 .",
    "\" Warren Gatland is a professional person and it wasn 't a case of 's I 'll phone my mate Rob up to if he wants a coaching job ' , he would done a fair amount of homework about , \" Howley air said .",
    "You can look at the things I 'm saying about my record and about the events of campaign and history and you 'll find if now and and then I miss a words or I get something slightly off , I 'll correct it , acknowledge where it are wrong .",
    "Wonder if 'll alive to see .",
    "We 'll have to combine and a numbered of people .",
]
|
28 |
-
|
29 |
-
def sentences_to_input_tokens(sentences):
    """Tokenize a batch of sentences and right-pad them into model inputs.

    Relies on ``tokenizer``, ``torch`` and ``device`` already being defined
    in the enclosing scope.

    Args:
        sentences: List of input strings passed to ``tokenizer`` as a batch.

    Returns:
        A tuple ``(encoding, sents_tokens)``:

        * ``encoding`` is a dict with ``"input_ids"``, ``"attention_mask"``
          and ``"position_ids"`` tensors, each moved to ``device``. Every row
          is padded on the right to the length of the longest sequence:
          input ids with ``tokenizer.pad_token_id``, attention mask and
          position ids with ``0``.
        * ``sents_tokens`` is the list of unpadded token-id lists, one per
          sentence, in input order.
    """
    sents_tokens = list(tokenizer(sentences)['input_ids'])
    max_length = max((len(ids) for ids in sents_tokens), default=0)
    pad_id = tokenizer.pad_token_id

    input_ids = []
    attention_masks = []
    position_ids = []
    for ids in sents_tokens:
        length = len(ids)
        pad_len = max_length - length
        input_ids.append(ids + [pad_id] * pad_len)
        # 1 marks a real token, 0 marks padding.
        attention_masks.append([1] * length + [0] * pad_len)
        # Positions count up from 0 for real tokens; padding slots get 0.
        position_ids.append(list(range(length)) + [0] * pad_len)

    encoding = {
        "input_ids": torch.tensor(input_ids).to(device),
        "attention_mask": torch.tensor(attention_masks).to(device),
        "position_ids": torch.tensor(position_ids).to(device),
    }

    return encoding, sents_tokens
|
70 |
-
|
71 |
-
def run_token_predictor_sentences(sentences):
    """Run the model on a batch of sentences and rebuild detokenized strings.

    Relies on ``model`` and ``tokenizer`` already being defined in the
    enclosing scope, and on ``sentences_to_input_tokens`` for batching.

    Args:
        sentences: List of input strings.

    Returns:
        A list of output strings, one per sentence, each stripped of leading
        and trailing whitespace.
    """
    encoding, sent_token_ids = sentences_to_input_tokens(sentences)
    predictions = model(**encoding)[0].cpu().tolist()

    outstrs = []
    for token_ids, sent_predictions in zip(sent_token_ids, predictions):
        # Drop the first and last position of both sequences (presumably the
        # special start/end tokens; confirm against the tokenizer in use).
        # Predictions are padded to the batch length, so zip stops at the
        # last real token.
        tokens = tokenizer.convert_ids_to_tokens(token_ids[1:-1])
        pieces = []
        for token, scores in zip(tokens, sent_predictions[1:-1]):
            if "▁" not in token:
                # No "▁" marker: append the piece unchanged.
                pieces.append(token)
            elif scores[0] > scores[1]:
                # Score 0 beats score 1: turn the marker into a space.
                pieces.append(token.replace("▁", " "))
            else:
                # Otherwise drop the marker, gluing the piece to the
                # preceding text.
                pieces.append(token.replace("▁", ""))
        outstrs.append("".join(pieces).strip())
    return outstrs
|
88 |
-
|
89 |
-
# Detokenize the whole batch, then print each original sentence followed by
# its detokenized form and a separator.
outs = run_token_predictor_sentences(sentences)
for detokenized, original in zip(outs, sentences):
    print(original)
    print(detokenized)
    print('\n------\n')
|
94 |
-
|
95 |
-
```
|
|
|
1 |
---
|
2 |
+
language: en
|
|
|
|
|
|
|
|
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|