|
import os.path |
|
from glob import glob |
|
from tqdm import tqdm |
|
import codecs |
|
import argparse |
|
def collect_vocab(paths):
    """Return the set of distinct characters found in the given text files.

    Each file is read as UTF-8 and stripped of leading/trailing whitespace
    before its characters are added to the result.

    Args:
        paths: Iterable of file paths to read (may be wrapped in a progress
            bar by the caller).

    Returns:
        A set of single-character strings. Line-break characters are never
        included because the vocab file stores one token per line.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    vocab = set()
    for path in paths:
        with open(path, encoding='utf-8') as f:
            vocab.update(f.read().strip())
    # strip() only trims the ends; a multi-line file would otherwise add
    # '\n' / '\r' as tokens and produce blank lines in the vocab file.
    vocab.difference_update('\r\n')
    return vocab


def write_vocab(vocab, vocab_path):
    """Write *vocab* to *vocab_path* as UTF-8, one token per line.

    Tokens are written in sorted order so that repeated runs over the same
    data produce an identical file (set iteration order is not stable across
    interpreter runs). Missing parent directories are created.

    Args:
        vocab: Iterable of token strings.
        vocab_path: Destination file path.
    """
    out_dir = os.path.dirname(vocab_path)
    # A bare filename has an empty dirname, and os.makedirs('') raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Explicit encoding: the inputs are read as UTF-8, and the platform
    # default may be unable to encode the collected characters.
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sorted(vocab)))


def main(argv=None):
    """Build a character vocab file from the files matched by a glob pattern.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]`` when None.
    """
    # description: "trocr vocab generation"
    parser = argparse.ArgumentParser(description='trocr vocab生成')
    # help: "custom vocab file generation" (output path)
    parser.add_argument('--cust_vocab', default="./cust-data/vocab.txt", type=str, help="自定义vocab文件生成")
    # help: "custom training data character set" (input glob pattern)
    # NOTE(review): the default pattern matches *.jpg files, but every matched
    # file is decoded as UTF-8 text below -- confirm whether the label files
    # (e.g. *.txt) were intended as the default.
    parser.add_argument('--dataset_path', default="./dataset/train/*/*.jpg", type=str, help="自定义训练数字符集")
    args = parser.parse_args(argv)

    paths = glob(args.dataset_path)
    vocab = collect_vocab(tqdm(paths))
    write_vocab(vocab, args.cust_vocab)


if __name__ == '__main__':
    main()
|
|
|
|
|
|
|
|
|
|
|
|