File size: 828 Bytes
4db1ad4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88e4372
4db1ad4
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import os.path
from glob import glob
from tqdm import tqdm
import codecs
import argparse
def collect_vocab(paths) -> set:
    """Return the set of distinct characters found in the given text files.

    Args:
        paths: Iterable of file paths. Each file is read as UTF-8 text and
            stripped of leading/trailing whitespace before its characters
            are added to the vocabulary.

    Returns:
        A set of single-character strings.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    vocab = set()
    for path in paths:
        with open(path, encoding='utf-8') as f:
            vocab.update(f.read().strip())
    # The vocab file stores one token per line, so a line-break character
    # cannot be represented there; keeping it would only emit blank entries.
    vocab.difference_update('\r\n')
    return vocab


def write_vocab(vocab, vocab_path: str) -> None:
    """Write the vocabulary to ``vocab_path``, one character per line.

    Args:
        vocab: Iterable of characters to write.
        vocab_path: Destination file; missing parent directories are created.
    """
    out_dir = os.path.dirname(vocab_path)
    # A bare filename has no directory part, and os.makedirs('') raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Explicit UTF-8 so non-ASCII characters are written correctly regardless
    # of the platform's default encoding; sorted so the output is reproducible.
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sorted(vocab)))


def main(argv=None) -> None:
    """Parse command-line options and generate the custom vocab file.

    Args:
        argv: Argument list to parse; ``None`` means use ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='trocr vocab生成')
    parser.add_argument('--cust_vocab', default="./cust-data/vocab.txt", type=str, help="自定义vocab文件生成")
    # NOTE(review): the default pattern matches *.jpg, but every matched file
    # is decoded as UTF-8 text below — confirm the intended label extension.
    parser.add_argument('--dataset_path', default="./dataset/train/*/*.jpg", type=str, help="自定义训练数字符集")
    args = parser.parse_args(argv)

    paths = glob(args.dataset_path)
    write_vocab(collect_vocab(tqdm(paths)), args.cust_vocab)


if __name__ == '__main__':
    main()