nguyenvulebinh committed
Commit 95d8e06
Parent: 90a61c9

Create README.md

Files changed (1): README.md +41 -0
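
The snippet below downloads the custom envibert tokenizer resources from the model repo, loads the `nguyenvulebinh/spelling-oov` encoder-decoder model, and spells out an out-of-vocabulary word as Vietnamese phonetic syllables: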
```python
from transformers import EncoderDecoderModel
from importlib.machinery import SourceFileLoader
# note: cached_path / hf_bucket_url come from the transformers 4.x releases this
# card was written against; later releases moved downloading to huggingface_hub
from transformers.file_utils import cached_path, hf_bucket_url
import torch
import os

## Load model & tokenizer
cache_dir = './cache'
model_name = 'nguyenvulebinh/spelling-oov'

def download_tokenizer_files():
    # Fetch the custom tokenizer script and its vocabulary files from the model repo
    resources = ['envibert_tokenizer.py', 'dict.txt', 'sentencepiece.bpe.model']
    for item in resources:
        if not os.path.exists(os.path.join(cache_dir, item)):
            tmp_file = hf_bucket_url(model_name, filename=item)
            tmp_file = cached_path(tmp_file, cache_dir=cache_dir)
            os.rename(tmp_file, os.path.join(cache_dir, item))

download_tokenizer_files()
# The tokenizer class ships as a Python source file in the repo, so load it dynamically
spell_tokenizer = SourceFileLoader('envibert.tokenizer',
                                   os.path.join(cache_dir, 'envibert_tokenizer.py')) \
    .load_module().RobertaTokenizer(cache_dir)
spell_model = EncoderDecoderModel.from_pretrained(model_name)

def oov_spelling(word, num_candidate=1):
    result = []
    inputs = spell_tokenizer([word])
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    inputs = {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask)
    }
    # generate() needs at least num_candidate beams to return num_candidate sequences
    outputs = spell_model.generate(**inputs,
                                   num_beams=num_candidate,
                                   num_return_sequences=num_candidate)
    for output in outputs.cpu().detach().numpy().tolist():
        # decode to sentencepiece pieces, then merge the pieces back into plain text
        result.append(spell_tokenizer.sp_model.DecodePieces(
            spell_tokenizer.decode(output, skip_special_tokens=True).split()))
    return result

oov_spelling('spacespeaker')
# output: ['x pây x pếch cơ']
```
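
Because `num_candidate` drives beam search in the version above, asking for several candidates returns the top-scoring beams. A minimal sketch of such a call (the word and candidate count are illustrative; actual outputs depend on the model):

```python
# Ask for the three highest-scoring phonetic spellings of an OOV word
# (illustrative call; outputs depend on the model's beam search)
candidates = oov_spelling('spacespeaker', num_candidate=3)
for cand in candidates:
    print(cand)
```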