# canto_ocr / gen_vocab.py
# lywen's picture
# Update gen_vocab.py
# 88e4372
import os.path
from glob import glob
from tqdm import tqdm
import codecs
import argparse
# Extensions treated as image files. Such files cannot be decoded as UTF-8
# text, so their characters are read from a sibling ".txt" file instead.
# NOTE(review): assumes every image has a same-named ".txt" transcription
# next to it -- confirm against the dataset layout.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')


def label_path_for(path):
    """Return the text file whose characters should be read for *path*.

    Paths ending in one of ``IMAGE_EXTENSIONS`` (case-insensitive) are mapped
    to the same path with a ``.txt`` extension; any other path is returned
    unchanged, so globs that already match text files keep working.
    """
    stem, ext = os.path.splitext(path)
    if ext.lower() in IMAGE_EXTENSIONS:
        return stem + '.txt'
    return path


def collect_vocab(paths):
    """Return the set of distinct characters found in the given files.

    Args:
        paths: iterable of file paths; each one is resolved through
            ``label_path_for`` and read as UTF-8 text.

    Returns:
        A set of single-character strings. Leading/trailing whitespace of
        each file is stripped, and line-break characters are never included
        because the vocab file stores one token per line.

    Raises:
        OSError: if a file cannot be opened.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    vocab = set()
    for path in paths:
        with open(label_path_for(path), encoding='utf-8') as f:
            text = f.read().strip()
        # A newline token would produce blank lines in the output file.
        vocab.update(ch for ch in text if ch not in '\r\n')
    return vocab


def write_vocab(vocab, vocab_path):
    """Write *vocab* to *vocab_path*, one token per line, as UTF-8.

    Tokens are written in sorted order so repeated runs over the same data
    produce an identical file. The parent directory is created when the path
    has one. No trailing newline is written.
    """
    out_dir = os.path.dirname(vocab_path)
    # dirname is '' for a bare filename, and os.makedirs('') raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sorted(vocab)))


def main():
    """Parse command-line arguments and generate the vocab file."""
    parser = argparse.ArgumentParser(description='trocr vocab生成')
    parser.add_argument('--cust_vocab', default="./cust-data/vocab.txt", type=str, help="自定义vocab文件生成")
    parser.add_argument('--dataset_path', default="./dataset/train/*/*.jpg", type=str, help="自定义训练数字符集")
    args = parser.parse_args()

    paths = glob(args.dataset_path)
    # tqdm only wraps the iterable to display a progress bar.
    vocab = collect_vocab(tqdm(paths))
    write_vocab(vocab, args.cust_vocab)


if __name__ == '__main__':
    main()