import os

import pandas as pd
import torch
import datasets
from datasets import disable_caching
from transformers import RobertaTokenizerFast, RobertaForMaskedLM, DataCollatorWithPadding

disable_caching()

DEVICE = 'cuda:0'
ENCODER_MODEL_NAME = "entropy/roberta_zinc_480m"
ENCODER_BATCH_SIZE = 1024

TOKENIZER_MAX_LEN = 256
TOKENIZATION_NUM_PROC = 32

'''
Data source is expected to be a CSV file with a column of SMILES strings
denoted by `SMILES_COLUMN`. The CSV is processed in chunks of size `PROCESS_CHUNKSIZE`.

Processed chunks are saved to `SAVE_PATH` with the format `SAVE_PATH/processed_shard_{i}.hf`.
'''
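
# Illustrative example of the expected CSV layout (not from the original script;
# the header must match `SMILES_COLUMN` below, one SMILES string per row):
#
#   smiles
#   CCO
#   c1ccccc1
#   CC(=O)Oc1ccccc1C(=O)O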

DATASET_CSV_FILENAME = None
PROCESS_CHUNKSIZE = 1000000
SMILES_COLUMN = 'smiles'
MAX_CHUNKS = None
MAX_SMILES_LENGTH = 90
MIN_SMILES_LENGTH = 5
FILTER_NUM_PROC = 32
SAVE_PATH = None

assert DATASET_CSV_FILENAME is not None, "must specify dataset filename"
assert SAVE_PATH is not None, "must specify save path"


def tokenization(example):
    # tokenize SMILES strings; padding is deferred to the batch collator at embedding time
    return tokenizer(example[SMILES_COLUMN], add_special_tokens=True,
                     truncation=True, max_length=TOKENIZER_MAX_LEN)


def embed(inputs):
    # keep only the fields the model needs, pad to a uniform length, and move to the GPU
    inputs = {k: inputs[k] for k in ['input_ids', 'attention_mask']}
    inputs = collator(inputs)
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # last-layer hidden states, masked mean-pooled over the sequence dimension:
    # padding positions are zeroed out, then each sum is divided by the number of real tokens
    full_embeddings = outputs.hidden_states[-1]
    mask = inputs['attention_mask']
    mean_embeddings = (full_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1)

    return {'encoder_hidden_states': mean_embeddings}


def length_filter_smiles(example):
    # reject non-string entries (e.g. NaN from missing CSV values) before calling len()
    if not isinstance(example[SMILES_COLUMN], str):
        return False
    min_check = (len(example[SMILES_COLUMN]) > MIN_SMILES_LENGTH) if (MIN_SMILES_LENGTH is not None) else True
    max_check = (len(example[SMILES_COLUMN]) < MAX_SMILES_LENGTH) if (MAX_SMILES_LENGTH is not None) else True
    return min_check and max_check


tokenizer = RobertaTokenizerFast.from_pretrained(ENCODER_MODEL_NAME, max_len=TOKENIZER_MAX_LEN)
collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')

# the masked-LM checkpoint is used as a frozen encoder; only its hidden states are consumed
model = RobertaForMaskedLM.from_pretrained(ENCODER_MODEL_NAME)
model.to(DEVICE)
model.eval()

# stream the CSV in chunks so the full file never has to fit in memory
df_iter = pd.read_csv(DATASET_CSV_FILENAME, chunksize=PROCESS_CHUNKSIZE, usecols=[SMILES_COLUMN])


for i, df in enumerate(df_iter):
    print(f'processing dataset chunk {i}')

    dataset = datasets.Dataset.from_pandas(df)

    # drop rows that are not strings or fall outside the configured length bounds
    dataset = dataset.filter(length_filter_smiles, num_proc=FILTER_NUM_PROC)

    # tokenize on CPU workers, then compute mean-pooled encoder embeddings on the GPU
    dataset = dataset.map(tokenization, batched=True, num_proc=TOKENIZATION_NUM_PROC)
    dataset = dataset.map(embed, batched=True, batch_size=ENCODER_BATCH_SIZE)

    dataset.save_to_disk(f'{SAVE_PATH}/processed_shard_{i}.hf')

    if (MAX_CHUNKS is not None) and (i >= MAX_CHUNKS - 1):
        break

print('finished data processing')
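
# Example sketch (not part of the processing run above): the saved shards can be
# reloaded and merged for downstream use with `datasets.load_from_disk` and
# `datasets.concatenate_datasets`, following the shard naming scheme used in the loop.
#
#   import glob
#   from datasets import load_from_disk, concatenate_datasets
#
#   shard_paths = sorted(glob.glob(f'{SAVE_PATH}/processed_shard_*.hf'))
#   full_dataset = concatenate_datasets([load_from_disk(p) for p in shard_paths])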