"""Train a Tokenizer on a text file and serialize it to disk (CLI script)."""
from argparse import ArgumentParser

from tokenizer import Tokenizer

parser = ArgumentParser(
    prog='Flop Tokenizer Python code',
    description='Train a tokenizer on a text corpus and save it to a file.',
)

if __name__ == '__main__':
    # required=True: without it a missing -i crashes later in open(None)
    # with an opaque TypeError instead of a clean argparse usage error.
    parser.add_argument('-i', '--input_file', required=True,
                        help='path to the training text corpus')
    parser.add_argument('-o', '--output_file', default='tokenizer.bin',
                        help='where to write the trained tokenizer')
    # type=int: argparse passes CLI values through as strings unless told
    # otherwise, so previously the default was an int but any user-supplied
    # value reached Tokenizer.train() as a str.
    parser.add_argument('-n', '--max_vocab_size', type=int, default=32000,
                        help='maximum vocabulary size (default: 32000)')
    args = parser.parse_args()

    tokenizer = Tokenizer()
    # Explicit encoding so the corpus reads identically across platforms.
    with open(args.input_file, 'r', encoding='utf-8') as f:
        dataset = f.read()

    # NOTE(review): Tokenizer.train's size parameter is named max_length
    # here although the CLI flag is max_vocab_size — confirm the keyword
    # against the Tokenizer API; kept as-is to preserve behavior.
    tokenizer.train(dataset, max_length=args.max_vocab_size)
    tokenizer.to_file(args.output_file)
    print(f"Tokenizer has vocab size: {tokenizer.vocab_size}")