"""Train a SmilesPE tokenizer from a CSV dataset of SMILES strings.

Command-line arguments:
    dataset_file_path: CSV file read with pandas; its ``canonical_smiles``
        column supplies the training strings.
    output_file_path: file that ``learn_SPE`` writes its result to.
"""
import codecs
from SmilesPE.learner import *
import pandas as pd
import argparse

parser = argparse.ArgumentParser(description='Train SmilesPE Tokenizer.')
parser.add_argument('dataset_file_path', type=str,
                    help='Path to the dataset file')
parser.add_argument('output_file_path', type=str,
                    help='Path to file containing trained tokenizer weights')

# Parse the arguments
args = parser.parse_args()

df = pd.read_csv(args.dataset_file_path)
# NOTE: to experiment on a subset, slice the frame here, e.g. df[0:30000].

# Open the output inside a context manager so the handle is flushed and
# closed even when learn_SPE raises; the original left the file open.
with codecs.open(args.output_file_path, 'w') as output:
    # NOTE(review): the third positional argument (30000) is presumably the
    # number of symbols/merges to learn -- confirm against the signature of
    # SmilesPE.learner.learn_SPE.
    learn_SPE(
        df['canonical_smiles'].tolist(),
        output,
        30000,
        min_frequency=2000,
        augmentation=1,
        verbose=True,
        total_symbols=True,
    )