nguyenvulebinh
committed on
Commit
•
95d8e06
1
Parent(s):
90a61c9
Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
```python
|
2 |
+
from transformers import EncoderDecoderModel
|
3 |
+
from importlib.machinery import SourceFileLoader
|
4 |
+
from transformers.file_utils import cached_path, hf_bucket_url
|
5 |
+
import torch
|
6 |
+
import os
|
7 |
+
|
8 |
+
## Load model & tokenizer
cache_dir='./cache'  # local directory where tokenizer assets are downloaded
model_name='nguyenvulebinh/spelling-oov'  # Hugging Face hub repo id
|
11 |
+
|
12 |
+
def download_tokenizer_files():
    """Fetch the custom tokenizer resources from the model hub into ``cache_dir``.

    Files already present under ``cache_dir`` are skipped, so repeated calls
    are cheap. Relies on the module-level ``cache_dir`` and ``model_name``.
    """
    resources = ['envibert_tokenizer.py', 'dict.txt', 'sentencepiece.bpe.model']
    # Ensure the target directory exists before the os.rename below;
    # on a fresh checkout './cache' may not have been created yet.
    os.makedirs(cache_dir, exist_ok=True)
    for item in resources:
        if not os.path.exists(os.path.join(cache_dir, item)):
            tmp_file = hf_bucket_url(model_name, filename=item)
            tmp_file = cached_path(tmp_file, cache_dir=cache_dir)
            # Move the cache-hashed download to its plain, well-known name.
            os.rename(tmp_file, os.path.join(cache_dir, item))
|
19 |
+
|
20 |
+
# One-time setup: fetch tokenizer assets, then build the tokenizer and model.
download_tokenizer_files()
# The tokenizer class ships as a .py file in the repo; load it dynamically.
_tokenizer_module = SourceFileLoader(
    "envibert.tokenizer", os.path.join(cache_dir, 'envibert_tokenizer.py')
).load_module()
spell_tokenizer = _tokenizer_module.RobertaTokenizer(cache_dir)
spell_model = EncoderDecoderModel.from_pretrained(model_name)
|
23 |
+
|
24 |
+
def oov_spelling(word, num_candidate=1):
    """Respell an out-of-vocabulary word as Vietnamese phonetic syllables.

    Args:
        word: the OOV token to respell, e.g. ``'spacespeaker'``.
        num_candidate: number of candidate spellings to return.
            NOTE(review): ``generate`` defaults to greedy decoding; values > 1
            presumably require beam search or sampling to be configured —
            confirm before relying on it.

    Returns:
        A list of ``num_candidate`` decoded spelling strings.
    """
    result = []
    encoded = spell_tokenizer([word])
    inputs = {
        "input_ids": torch.tensor(encoded['input_ids']),
        "attention_mask": torch.tensor(encoded['attention_mask']),
    }
    # Inference only: disable autograd to avoid tracking gradients.
    with torch.no_grad():
        outputs = spell_model.generate(**inputs, num_return_sequences=num_candidate)
    for output in outputs.cpu().detach().numpy().tolist():
        # decode() yields space-separated sentencepiece pieces;
        # DecodePieces merges them back into plain text.
        result.append(spell_tokenizer.sp_model.DecodePieces(
            spell_tokenizer.decode(output, skip_special_tokens=True).split()))
    return result
|
37 |
+
|
38 |
+
# Example: respell the English OOV word 'spacespeaker'.
oov_spelling('spacespeaker')
# output: ['x pây x pếch cơ']
|
40 |
+
|
41 |
+
```
|