SmilesPeTokenizer / trainSmilesPeTokenizer.py
saicharan2804
SmilesPE tokenizer
1fc0c38
raw
history blame contribute delete
No virus
639 Bytes
import codecs
from SmilesPE.learner import *
import pandas as pd
import argparse
# Command-line script: read a CSV dataset, take its 'canonical_smiles' column,
# and hand the SMILES strings to SmilesPE's learn_SPE, which writes its result
# to the requested output file.
#
# Usage: python trainSmilesPeTokenizer.py <dataset_file_path> <output_file_path>
parser = argparse.ArgumentParser(description='Train SmilesPE Tokenizer.')
parser.add_argument('dataset_file_path', type=str, help='Path to the dataset file')
parser.add_argument('output_file_path', type=str, help='Path to file containing trained tokenizer weights')
# Parse the arguments
args = parser.parse_args()

# The dataset is loaded before the output file is opened, so a failure while
# reading the CSV does not truncate an existing output file.
df = pd.read_csv(args.dataset_file_path)
# For a quick trial run the frame can be subsampled here, e.g. df = df[0:30000]

# Open the output inside a context manager so the handle is flushed and closed
# even if learn_SPE raises part-way through training.
with codecs.open(args.output_file_path, 'w') as output:
    learn_SPE(
        df['canonical_smiles'].tolist(),  # input SMILES as a plain Python list
        output,                           # open, writable file handle
        30000,                            # num_symbols (see SmilesPE docs)
        min_frequency=2000,
        augmentation=1,
        verbose=True,
        # NOTE(review): per the SmilesPE docs this makes num_symbols count the
        # single-character tokens too — confirm against the installed version.
        total_symbols=True,
    )