{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "gpuClass": "standard" }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "v8u-nj2QMdOe" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NEhxV9lTIJSC", "outputId": "591217f5-abc4-4813-a8f0-f4c54e6040ac" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: rdkit in /usr/local/lib/python3.10/dist-packages (2023.3.1)\n", "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from rdkit) (8.4.0)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from rdkit) (1.22.4)\n" ] } ], "source": [ "pip install rdkit" ] }, { "cell_type": "code", "source": [ "'''\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "'''" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "id": "z296l7b2M6CV", "outputId": "b02b2c1c-4e65-4f2d-c8a3-7fdceb18e79e" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "\"\\nfrom google.colab import drive\\ndrive.mount('/content/drive')\\n\"" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 2 } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5m_uOzL0IJOQ" }, "outputs": [], "source": [ "import math\n", "import logging\n", "\n", "import torch\n", "import torch.nn as nn\n", "from torch.nn import functional as F\n", "\n", "logger = logging.getLogger(__name__)\n", "\n", "class GPTConfig:\n", " \"\"\" base GPT config, params common to all GPT versions \"\"\"\n", " embd_pdrop = 0.1\n", " resid_pdrop = 0.1\n", " attn_pdrop = 0.1\n", "\n", " def __init__(self, vocab_size, block_size, **kwargs):\n", " self.vocab_size = vocab_size\n", " self.block_size = block_size\n", " for k,v in kwargs.items():\n", " setattr(self, k, v)\n", "\n", "class GPT1Config(GPTConfig):\n", " \"\"\" GPT-1 like network roughly 125M params \"\"\"\n", " n_layer = 12\n", " n_head = 12\n", " n_embd = 768\n", "\n", "class CausalSelfAttention(nn.Module):\n", " \"\"\"\n", " A vanilla multi-head masked self-attention layer with a projection at the end.\n", " I believe I could have just used torch.nn.MultiheadAttention but their documentation\n", " is all but absent and code ugly so I don't trust it, rolling my own here.\n", " \"\"\"\n", "\n", " def __init__(self, config):\n", " super().__init__()\n", " assert config.n_embd % config.n_head == 0\n", " # key, query, value projections for all heads\n", " self.key = nn.Linear(config.n_embd, config.n_embd)\n", " self.query = nn.Linear(config.n_embd, config.n_embd)\n", " self.value = nn.Linear(config.n_embd, config.n_embd)\n", " # regularization\n", " self.attn_drop = nn.Dropout(config.attn_pdrop)\n", " self.resid_drop = nn.Dropout(config.resid_pdrop)\n", " # output projection\n", " self.proj = nn.Linear(config.n_embd, config.n_embd)\n", " # causal mask to ensure that attention is only applied to the left in the input sequence\n", " self.register_buffer(\"mask\", torch.tril(torch.ones(config.block_size, config.block_size))\n", " .view(1, 1, config.block_size, config.block_size))\n", " self.n_head = config.n_head\n", "\n", " def forward(self, x, layer_past=None):\n", " B, T, C = x.size()\n", "\n", " # calculate query, key, values for all heads in batch and move head forward to be the batch dim\n", " k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)\n", " q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)\n", " v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)\n", "\n", " # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)\n", " att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))\n", " att = att.masked_fill(self.mask[:,:,:T,:T] == 0, -1e10) # todo: just use float('-inf') instead?\n", " att = F.softmax(att, dim=-1)\n", " att = self.attn_drop(att)\n", " y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)\n", " y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side\n", "\n", " # output projection\n", " y = self.resid_drop(self.proj(y))\n", " return y\n", "\n", "class Block(nn.Module):\n", " \"\"\" an unassuming Transformer block \"\"\"\n", "\n", " def __init__(self, config):\n", " super().__init__()\n", " self.ln1 = nn.LayerNorm(config.n_embd)\n", " self.ln2 = nn.LayerNorm(config.n_embd)\n", " self.attn = CausalSelfAttention(config)\n", " self.mlp = nn.Sequential(\n", " nn.Linear(config.n_embd, 4 * config.n_embd),\n", " nn.GELU(),\n", " nn.Linear(4 * config.n_embd, config.n_embd),\n", " nn.Dropout(config.resid_pdrop),\n", " )\n", "\n", " def forward(self, x):\n", " x = x + self.attn(self.ln1(x))\n", " x = x + self.mlp(self.ln2(x))\n", " return x\n", "\n", "class GPT(nn.Module):\n", " \"\"\" the full GPT language model, with a context size of block_size \"\"\"\n", "\n", " def __init__(self, config):\n", " super().__init__()\n", "\n", " # input embedding stem\n", " self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)\n", " self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))\n", " self.drop = nn.Dropout(config.embd_pdrop)\n", " # transformer\n", " self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])\n", " # decoder head\n", " self.ln_f = nn.LayerNorm(config.n_embd)\n", " self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)\n", "\n", " self.block_size = config.block_size\n", " self.apply(self._init_weights)\n", "\n", " logger.info(\"number of parameters: %e\", sum(p.numel() for p in self.parameters()))\n", "\n", " def _init_weights(self, module):\n", " if isinstance(module, (nn.Linear, nn.Embedding)):\n", " module.weight.data.normal_(mean=0.0, std=0.02)\n", " if isinstance(module, nn.Linear) and module.bias is not None:\n", " module.bias.data.zero_()\n", " elif isinstance(module, nn.LayerNorm):\n", " module.bias.data.zero_()\n", " module.weight.data.fill_(1.0)\n", "\n", " def get_block_size(self):\n", " return self.block_size\n", "\n", " def forward(self, idx, targets=None):\n", " b, t = idx.size()\n", " assert t <= self.block_size, \"Cannot forward, model block size is exhausted.\"\n", "\n", " # forward the GPT model\n", " token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector\n", " position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector\n", " x = self.drop(token_embeddings + position_embeddings)\n", " x = self.blocks(x)\n", " x = self.ln_f(x)\n", " logits = self.head(x)\n", "\n", " # if we are given some desired targets also calculate the loss\n", " loss = None\n", " if targets is not None:\n", " loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n", "\n", " return logits, loss" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1cbaiuQ4IJLF" }, "outputs": [], "source": [ "import math\n", "import logging\n", "\n", "from tqdm import tqdm\n", "import numpy as np\n", "\n", "import torch\n", "import torch.optim as optim\n", "from torch.optim.lr_scheduler import LambdaLR\n", "from torch.utils.data.dataloader import DataLoader\n", "\n", "logger = logging.getLogger(__name__)\n", "\n", "class TrainerConfig:\n", " # optimization parameters\n", " max_epochs = 10\n", " batch_size = 64\n", " learning_rate = 3e-4\n", " betas = (0.9, 0.95)\n", " grad_norm_clip = 1.0\n", " weight_decay = 0.1 # only applied on matmul weights\n", " # learning rate decay params: linear warmup followed by cosine decay to 10% of original\n", " lr_decay = False\n", " warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere\n", " final_tokens = 260e9 # (at what point we reach 10% of original LR)\n", " # checkpoint settings\n", " ckpt_path = None\n", " num_workers = 0 # for DataLoader\n", "\n", " def __init__(self, **kwargs):\n", " for k,v in kwargs.items():\n", " setattr(self, k, v)\n", "\n", "class Trainer:\n", "\n", " def __init__(self, model, train_dataset, test_dataset, config):\n", " self.model = model\n", " self.train_dataset = train_dataset\n", " self.test_dataset = test_dataset\n", " self.config = config\n", "\n", " # take over whatever gpus are on the system\n", " self.device = 'cpu'\n", " if torch.cuda.is_available():\n", " self.device = torch.cuda.current_device()\n", " self.model = torch.nn.DataParallel(self.model).to(self.device)\n", "\n", " def save_checkpoint(self):\n", " if self.config.ckpt_path is not None:\n", " ckpt_model = self.model.module if hasattr(self.model, \"module\") else self.model\n", " logger.info(\"saving %s\", self.config.ckpt_path)\n", " torch.save(ckpt_model.state_dict(), self.config.ckpt_path)\n", "\n", " def train(self):\n", " model, config = self.model, self.config\n", "\n", " # create the optimizer\n", " no_decay = [\"bias\", \"LayerNorm.weight\"]\n", " params_decay = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)]\n", " params_nodecay = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)]\n", " optim_groups = [\n", " {\"params\": params_decay, \"weight_decay\": config.weight_decay},\n", " {\"params\": params_nodecay, \"weight_decay\": 0.0},\n", " ]\n", " optimizer = optim.AdamW(optim_groups, lr=config.learning_rate, betas=config.betas)\n", "\n", " def run_epoch(split):\n", " is_train = split == 'train'\n", " model.train(is_train)\n", " data = self.train_dataset if is_train else self.test_dataset\n", " loader = DataLoader(data, batch_size=config.batch_size, num_workers=config.num_workers)\n", "\n", " losses = []\n", " pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)\n", " for it, (x, y) in pbar:\n", "\n", " # place data on the correct device\n", " x = x.to(self.device)\n", " y = y.to(self.device)\n", "\n", " # forward the model\n", " with torch.set_grad_enabled(is_train):\n", " logits, loss = model(x, y)\n", " loss = loss.mean() # collapse all losses if they are scattered on multiple gpus\n", " losses.append(loss.item())\n", "\n", " if is_train:\n", "\n", " # backprop and update the parameters\n", " model.zero_grad()\n", " loss.backward()\n", " torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)\n", " optimizer.step()\n", "\n", " # decay the learning rate based on our progress\n", " if config.lr_decay:\n", " self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100)\n", " if self.tokens < config.warmup_tokens:\n", " # linear warmup\n", " lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens))\n", " else:\n", " # cosine learning rate decay\n", " progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))\n", " lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))\n", " lr = config.learning_rate * lr_mult\n", " for param_group in optimizer.param_groups:\n", " param_group['lr'] = lr\n", " else:\n", " lr = config.learning_rate\n", "\n", " # report progress\n", " pbar.set_description(f\"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}\")\n", "\n", " if not is_train:\n", " logger.info(\"test loss: %f\", np.mean(losses))\n", "\n", " self.tokens = 0 # counter used for learning rate decay\n", " for epoch in range(config.max_epochs):\n", "\n", " run_epoch('train')\n", " if self.test_dataset is not None:\n", " run_epoch('test')\n", "\n", " self.save_checkpoint()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mMNpQFu_IJHv" }, "outputs": [], "source": [ "import random\n", "import numpy as np\n", "import torch\n", "import torch.nn as nn\n", "from torch.nn import functional as F\n", "\n", "def set_seed(seed):\n", " random.seed(seed)\n", " np.random.seed(seed)\n", " torch.manual_seed(seed)\n", " torch.cuda.manual_seed_all(seed)\n", "\n", "def top_k_logits(logits, k):\n", " v, ix = torch.topk(logits, k)\n", " out = logits.clone()\n", " out[out < v[:, [-1]]] = -float('Inf')\n", " return out\n", "\n", "@torch.no_grad()\n", "def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):\n", " \"\"\"\n", " take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in\n", " the sequence, feeding the predictions back into the model each time. Clearly the sampling\n", " has quadratic complexity unlike an RNN that is only linear, and has a finite context window\n", " of block_size, unlike an RNN that has an infinite context window.\n", " \"\"\"\n", " block_size = model.get_block_size()\n", " model.eval()\n", " for k in range(steps):\n", " x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed\n", " logits, _ = model(x_cond)\n", " # pluck the logits at the final step and scale by temperature\n", " logits = logits[:, -1, :] / temperature\n", " # optionally crop probabilities to only the top k options\n", " if top_k is not None:\n", " logits = top_k_logits(logits, top_k)\n", " # apply softmax to convert to probabilities\n", " probs = F.softmax(logits, dim=-1)\n", " # sample from the distribution or take the most likely\n", " if sample:\n", " ix = torch.multinomial(probs, num_samples=1)\n", " else:\n", " _, ix = torch.topk(probs, k=1, dim=-1)\n", " # append to the sequence and continue\n", " x = torch.cat((x, ix), dim=1)\n", "\n", " return x" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_7F51OUaIJDB" }, "outputs": [], "source": [ "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ElaOntuyH37Z" }, "outputs": [], "source": [ "# set up logging\n", "import logging\n", "import pandas as pd\n", "logging.basicConfig(\n", " format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", " datefmt=\"%m/%d/%Y %H:%M:%S\",\n", " level=logging.INFO,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UbFxi0jzH37a" }, "outputs": [], "source": [ "# make deterministic\n", "#from mingpt.utils import set_seed\n", "set_seed(42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1ycyj9FEH37b" }, "outputs": [], "source": [ "import numpy as np\n", "import torch\n", "import torch.nn as nn\n", "from torch.nn import functional as F" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Q98b17X5H37b" }, "outputs": [], "source": [ "from torch.utils.data import Dataset\n", "\n", "class CharDataset(Dataset):\n", "\n", " def __init__(self, data, content):\n", " chars = sorted(list(set(content)))\n", " data_size, vocab_size = len(data), len(chars)\n", " print('data has %d smiles, %d unique characters.' % (data_size, vocab_size))\n", " \n", " self.stoi = { ch:i for i,ch in enumerate(chars) }\n", " self.itos = { i:ch for i,ch in enumerate(chars) }\n", " self.block_size = block_size\n", " self.vocab_size = vocab_size\n", " self.data = data\n", " \n", " def __len__(self):\n", " return math.ceil(len(self.data) / (self.block_size + 1))\n", "\n", " def __getitem__(self, idx):\n", " smiles = self.data[idx]\n", " len_smiles = len(smiles)\n", " dix = [self.stoi[s] for s in smiles]\n", " x = torch.tensor(dix[:-1], dtype=torch.long)\n", " y = torch.tensor(dix[1:], dtype=torch.long)\n", " \n", " return x, y" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6o9wcP7-H37c" }, "outputs": [], "source": [ "# you can download this moses file here https://media.githubusercontent.com/media/molecularsets/moses/master/data/dataset_v1.csv\n", "f = open('/content/moses.txt')\n", "smiles = f.read().splitlines()\n", "#smiles =# pd.read_csv('/content/dataset_v1.csv')['SMILES']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "t-MtD_7CH37c" }, "outputs": [], "source": [ "# some preprocessin, adding \"<\" to make every smile of max length (for us '<' is an end token)\n", "lens = [len(i) for i in smiles]\n", "max_len = max(lens)\n", "smiles = [ i + str('<')*(max_len - len(i)) for i in smiles]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Eib-CkQaH37d" }, "outputs": [], "source": [ "content = ' '.join(smiles)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_sa8P9wiH37d" }, "outputs": [], "source": [ "block_size = max_len" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "juqft0fIH37e", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "9ad94fc2-41c7-43b6-c295-7a7adc27a8b4" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "data has 1584783 smiles, 35 unique characters.\n" ] } ], "source": [ "train_dataset = CharDataset(smiles, content, )" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HvmcQyMOH37f" }, "outputs": [], "source": [ "#from mingpt.model import GPT, GPTConfig\n", "mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,\n", " n_layer=8, n_head=8, n_embd=256)\n", "model = GPT(mconf)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qEd0ltlOH37f", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "14d01029-44ee-4722-f31f-956c44c128db" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:561: UserWarning: This DataLoader will create 10 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", " warnings.warn(_create_warning_msg(\n", "epoch 1 iter 50: train loss 0.28502. lr 5.999633e-04: 100%|██████████| 51/51 [00:35<00:00, 1.42it/s]\n", "epoch 2 iter 50: train loss 0.23235. lr 5.998532e-04: 100%|██████████| 51/51 [00:33<00:00, 1.51it/s]\n", "epoch 3 iter 50: train loss 0.20597. lr 5.996698e-04: 100%|██████████| 51/51 [00:34<00:00, 1.49it/s]\n", "epoch 4 iter 50: train loss 0.18653. lr 5.994130e-04: 100%|██████████| 51/51 [00:34<00:00, 1.48it/s]\n", "epoch 5 iter 50: train loss 0.17477. lr 5.990830e-04: 100%|██████████| 51/51 [00:34<00:00, 1.47it/s]\n", "epoch 6 iter 50: train loss 0.16794. lr 5.986797e-04: 100%|██████████| 51/51 [00:34<00:00, 1.46it/s]\n", "epoch 7 iter 50: train loss 0.15940. lr 5.982034e-04: 100%|██████████| 51/51 [00:35<00:00, 1.45it/s]\n", "epoch 8 iter 50: train loss 0.15124. lr 5.976541e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 9 iter 50: train loss 0.14482. lr 5.970320e-04: 100%|██████████| 51/51 [00:36<00:00, 1.38it/s]\n", "epoch 10 iter 50: train loss 0.13632. lr 5.963372e-04: 100%|██████████| 51/51 [00:35<00:00, 1.43it/s]\n", "epoch 11 iter 50: train loss 0.13094. lr 5.955699e-04: 100%|██████████| 51/51 [00:36<00:00, 1.40it/s]\n", "epoch 12 iter 50: train loss 0.12819. lr 5.947302e-04: 100%|██████████| 51/51 [00:36<00:00, 1.41it/s]\n", "epoch 13 iter 50: train loss 0.12430. lr 5.938184e-04: 100%|██████████| 51/51 [00:36<00:00, 1.42it/s]\n", "epoch 14 iter 50: train loss 0.12056. lr 5.928348e-04: 100%|██████████| 51/51 [00:35<00:00, 1.43it/s]\n", "epoch 15 iter 50: train loss 0.12150. lr 5.917794e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 16 iter 50: train loss 0.11648. lr 5.906527e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 17 iter 50: train loss 0.11410. lr 5.894549e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 18 iter 50: train loss 0.11176. lr 5.881862e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 19 iter 50: train loss 0.10950. lr 5.868470e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 20 iter 50: train loss 0.10975. lr 5.854376e-04: 100%|██████████| 51/51 [00:35<00:00, 1.45it/s]\n", "epoch 21 iter 50: train loss 0.10889. lr 5.839584e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 22 iter 50: train loss 0.10522. lr 5.824096e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 23 iter 50: train loss 0.10198. lr 5.807918e-04: 100%|██████████| 51/51 [00:35<00:00, 1.45it/s]\n", "epoch 24 iter 50: train loss 0.10006. lr 5.791053e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 25 iter 50: train loss 0.10136. lr 5.773504e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 26 iter 50: train loss 0.09903. lr 5.755277e-04: 100%|██████████| 51/51 [00:35<00:00, 1.45it/s]\n", "epoch 27 iter 50: train loss 0.09688. lr 5.736376e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 28 iter 50: train loss 0.09715. lr 5.716805e-04: 100%|██████████| 51/51 [00:35<00:00, 1.45it/s]\n", "epoch 29 iter 50: train loss 0.09342. lr 5.696569e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n", "epoch 30 iter 50: train loss 0.09446. lr 5.675674e-04: 100%|██████████| 51/51 [00:35<00:00, 1.44it/s]\n" ] } ], "source": [ "#from mingpt.trainer import Trainer, TrainerConfig\n", "import math\n", "# initialize a trainer instance and kick off training\n", "tconf = TrainerConfig(max_epochs=30, batch_size=128, learning_rate=6e-4,\n", " lr_decay=True, warmup_tokens=32*20, final_tokens=200*len(train_dataset)*block_size,\n", " num_workers=10)\n", "trainer = Trainer(model, train_dataset, None, tconf)\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zFZyEKG0FeGK" }, "outputs": [], "source": [ "#torch.save(model.state_dict(), '/content/gpt_model_state')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "eC3Vw3lHIImL" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "10Vy3IcOM8Jj" }, "outputs": [], "source": [ "# alright, let's sample some molecules and draw them using rdkit\n", "\n", "from rdkit import Chem\n", "from rdkit.Chem.Draw import IPythonConsole\n", "from IPython.core.display import HTML\n", "from rdkit.Chem.QED import qed\n", "from rdkit.Chem import PandasTools\n", "#from mingpt.utils import sample\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5LsEKNc-M8Jj" }, "outputs": [], "source": [ "def show(df):\n", " return HTML(df.to_html(notebook=True))\n", "PandasTools.RenderImagesInAllDataFrames(images=True)" ] }, { "cell_type": "code", "source": [ "import tqdm as tqdm" ], "metadata": { "id": "Xa3IuwDeey5k" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 234 }, "outputId": "828d570d-084a-441f-f133-67535d88179a", "id": "6-hBepJvM8Jk" }, "outputs": [ { "output_type": "error", "ename": "NameError", "evalue": "ignored", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mgen_smiles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mcontext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"C\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m500\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtrain_dataset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstoi\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ms\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlong\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m...\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblock_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtemperature\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.7\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtop_k\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'tqdm' is not defined" ] } ], "source": [ "molecules = []\n", "gen_smiles = []\n", "context = \"C\"\n", "for i in tqdm.tqdm(range(500)):\n", " x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)\n", " y = sample(model, x, block_size, temperature=0.7, sample=True, top_k=5)[0]\n", " completion = ''.join([train_dataset.itos[int(i)] for i in y])\n", " completion = completion.replace('<', '')\n", " mol = Chem.MolFromSmiles(completion)\n", " if mol:\n", " molecules.append(mol)\n", " gen_smiles.append(completion)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "a9f8f5b3-d809-4fbf-f203-3e9ff460ac0e", "id": "kowxCMhZM8Jk" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['COc1cc(C(=O)Nc2ccccc2)cc(OC)c1OC',\n", " 'Cc1ccc(C(=O)Nc2ccccc2Cl)cc1',\n", " 'COc1ccc(C(=O)Nc2ccc(OC)c(C)c2)cc1',\n", " 'CCOc1ccc(OC)c(NC(=O)CSc2nnc3ccccc3n2)c1',\n", " 'CCOC(=O)c1ccc(NC(=O)c2ccc(C)o2)cc1',\n", " 'Cc1sc2ncnc(NC(=O)c3ccccc3)c2c1C',\n", " 'COc1ccc(-c2nc(C)c(C)c(C(=O)Nc3ccccc3)c2)cc1',\n", " 'COC(=O)c1cccc(NC(=O)COc2ccc(C)c(C)c2)c1',\n", " 'CC(=O)Nc1ccc(-c2ccc3c(c2)OCO3)cc1',\n", " 'Cc1ccc(-c2nnc(C(=O)Nc3ncncc3N)cc2)cc1',\n", " 'COc1cc2nc(SCC(=O)Nc3ccccc3)c(=O)c2cc1',\n", " 'Cc1ccc(C(=O)N2CCN(c3ccccc3)CC2)cc1',\n", " 'COc1ccc(S(=O)(=O)Nc2ccccc2C)cc1',\n", " 'COc1ccc(NC(=O)c2ccc3c(c2)OCO3)cc1',\n", " 'COc1ccc(-n2nnnc2-c2ccccc2C(C)(C)C)cc1',\n", " 'Cc1ccc(S(=O)(=O)Nc2ccccc2C(F)(F)F)cc1',\n", " 'COc1ccc(NC(=O)c2cccc(Cl)c2)cc1',\n", " 'COc1ccc(C(=O)Nc2ccc(F)cc2)cc1OC',\n", " 'Cc1cccc(C(=O)Nc2ccc(C(F)(F)F)cc2)c1',\n", " 'COc1cccc(OC)c1C(=O)Nc1ccc(C)cc1',\n", " 'CCOC(=O)Nc1cccc(NC(=O)c2cccs2)c1',\n", " 'COc1nc2c(cc1S(=O)(=O)c1ccccc1)CC2',\n", " 'Cc1nnc(NC(=O)CSc2ncnc3ccccc3n2C)s1',\n", " 'COc1cccc(C(=O)Nc2ccc(OC)cn2)c1',\n", " 'COC(=O)c1ccccc1NC(=O)c1ccccc1',\n", " 'COc1ccc(CNS(=O)(=O)c2ccccc2)cc1',\n", " 'Cc1cccc(NC(=O)Nc2ccc(Cl)cc2)c1',\n", " 'Cc1ccc(NS(=O)(=O)c2ccc(Cl)c(Cl)c2)cc1',\n", " 'Cc1ccc(NC(=O)Cn2nnc3ccccc3c2=O)cc1',\n", " 'COc1c(Cl)cc(Cl)cc1OC(=O)NCc1ccccc1',\n", " 'CC(=O)C1CCCN(C(=O)Nc2ccc(F)cc2)c2ccccc21',\n", " 'Cc1nc(-c2ccccc2)c(C#N)c(Cl)c1C#N',\n", " 'Cc1cccc(NC(=O)NCc2cccs2)c1',\n", " 'Cc1cc(C)nc(SCC(=O)NCc2cccs2)n1',\n", " 'Cc1ccc(-c2ccccc2)cc1OCC(=O)Nc1ccccc1',\n", " 'Cn1cnnc1SCC(=O)Nc1ccc(Cl)cc1',\n", " 'Cn1c(=O)c2ccc(Nc3ccccc3)cc2n(C)c1=O',\n", " 'CC(C)C(=O)Nc1ccc(C)cc1C(=O)Nc1cccs1',\n", " 'CCOc1ccc(C(=O)Nc2cccc(C(F)(F)F)c2)cc1',\n", " 'COc1ccc(NC(=O)CSc2nc3ccccc3n2C)cc1',\n", " 'Cc1ccc(NC(=O)Cn2c(-c3ccco3)nc3ccccc23)cc1',\n", " 'COc1ccc(NC(=O)c2ccc(Cl)c(Cl)c2)cc1',\n", " 'CN(C)CC(=O)Nc1ccc(C)cc1Cl',\n", " 'CC(=O)Nc1ccc(NC(=O)c2ccc(F)cc2)cc1',\n", " 'CCOc1ccc(C(=O)Nc2ccc(C)cc2)cc1',\n", " 'COC(=O)c1cc(C)nc(SCc2ccc(F)cc2)n1',\n", " 'CC(C)(C)CC(=O)Nc1ccc(Cl)cc1',\n", " 'CCOC(=O)c1c(NC(=O)c2ccccc2)nc2ccccc2c1C',\n", " 'Cn1cnnc1SCC(=O)Nc1ccc(F)cc1',\n", " 'COc1ccc(-c2ccc(NC(=O)c3ccncc3)cn2)cc1',\n", " 'COC(=O)c1sc2nc(SCC(=O)N(C)C)c2c1C',\n", " 'CC(C)S(=O)(=O)Nc1ccc(Cl)cc1',\n", " 'COc1ccc(C(=O)Nc2cccs2)cc1OC',\n", " 'CC(=O)Nc1ccc(NC(=O)c2ccccc2)cc1',\n", " 'CCOC(=O)c1ccc(NC(=O)c2ccccc2)cc1',\n", " 'COC(=O)c1c(N)n(-c2cccs2)c2ccccc2[nH]1',\n", " 'COc1ccc(NC(=O)c2ccc(NC(C)=O)cc2)cc1',\n", " 'CCOC(=O)c1ccc(NC(=O)c2ccccc2)cc1',\n", " 'CCOC(=O)c1ccc(C(=O)NCc2ccco2)cc1',\n", " 'Cc1cc(C)c(C#N)c(SCC(=O)N2CCCOCC2)n1',\n", " 'Cc1ccc(-c2cc(NC(=O)CC(C)C)cc(C)c2)cc1',\n", " 'COC(=O)c1nn(-c2ccc(C(F)(F)F)cc2)cc1C#N',\n", " 'COC(=O)C1=C(C)N(C)C(=O)NC1c1ccc(OC)cc1',\n", " 'COc1ccccc1NC(=O)c1ccc(Cl)cc1',\n", " 'Cc1ccc(-c2nc3ccccc3n2CC(=O)Nc2ccccc2)cc1',\n", " 'Cc1cc(C)n(-c2ccc(Cl)cc2)n1',\n", " 'COc1ccc(NC(=O)c2ccccc2)cc1',\n", " 'COC(=O)c1ccc(NC(=O)c2cccc(Cl)c2)cc1',\n", " 'COc1ccc(OC)c(C(=O)Nc2ccc(C)cc2C#N)c1',\n", " 'Cc1nc2cc(Cl)cc2c(Cl)c(=O)n1Cc1ccccc1',\n", " 'COc1ccc(C(=O)Nc2ccccc2C)cc1',\n", " 'Cc1ccc(NC(=O)CSc2nc(C)cc(C)n2)cc1',\n", " 'COC(=O)c1ccc(S(=O)(=O)NCc2ccccc2)cc1',\n", " 'COc1ccc(C(=O)NCCc2ccccc2C)cc1',\n", " 'COc1ccc(NC(=O)c2ccncc2)cc1',\n", " 'CC1=C(C(=O)Nc2ccc(Cl)cc2)NC(c2ccccc2)NC(=O)N1',\n", " 'CCOC(=O)Cn1nc2cc(Cl)cc2c(=O)n1CCc1ccccc1',\n", " 'COc1ccc2c(c1OC)C(O)(C(=O)c1ccccc1)N2',\n", " 'CCOC(=O)Nc1ccc(NC(=O)c2ccccc2)cc1',\n", " 'COc1ccc(NC(=O)CSc2nc3ccccc3c(=O)s2)cc1',\n", " 'Cc1ccc(OC(=O)Nc2ccccc2)c(C(N)=O)c1',\n", " 'Cc1nc2cccc(-c3ccc(Cl)cc3)cc2n1',\n", " 'CN(C)S(=O)(=O)c1ccc(NC(=O)c2ccccn2)cc1',\n", " 'COc1ccccc1NC(=O)c1cccc(OC)c1',\n", " 'COc1cc(-c2ccc(C)cc2)cc(O)c1OC',\n", " 'CC(=O)Nc1ccc(NC(=O)c2cncc(Cl)c2Cl)cc1',\n", " 'Cc1ccc2c(c1)N(c1ccccc1)S(=O)(=O)N2',\n", " 'CC(=O)c1ccc(NC(=O)c2ccc(NC(C)=O)cc2)cc1',\n", " 'COc1ccccc1NS(=O)(=O)c1ccc(NC(=O)c2ccccc2)cc1',\n", " 'COc1ccc(C(=O)Nc2nc(C)cc(C)c2C#N)cc1',\n", " 'COc1ccc(CC(=O)Nc2cccc(C)c2)cc1',\n", " 'Cc1noc(-c2cccc(Cl)c2)c1C(=O)N1CCOCC1',\n", " 'CCOC(=O)c1ccc(NC(=O)c2cccc(Cl)c2)cc1',\n", " 'CC(=O)OC1CC(C)N(C(=O)c2ccccc2)c2ccccc21',\n", " 'Cc1cccc(NC(=O)Nc2ccccc2)c1',\n", " 'CCOC(=O)c1ccc(NC(=O)c2ccc(Cl)cc2)cc1',\n", " 'Cc1cccc(NC(=O)CSc2nc3ccccc3[nH]2)c1',\n", " 'Cc1cc(C)c(C#N)c(SCC(=O)Nc2ccccc2)n1',\n", " 'Cc1cc(C)c(C#N)c(SCC(=O)N2CCOCC2)n1',\n", " 'CCOC(=O)c1c(NC(=O)c2cccnc2)sc2c1CCCC2',\n", " 'Cc1cc(C)c(NC(=O)c2ccccc2)c(C#N)c1',\n", " 'CC(=O)Nc1ccc(NC(=O)COc2ccccc2)cc1',\n", " 'Cc1nc(NC(=O)CSc2ccc(Cl)cc2)no1',\n", " 'COc1ccc(NC(=O)N2CCN(c3ccc(C)cc3)CC2)cc1',\n", " 'COC(=O)c1scc(C(=O)NCc2ccccc2)c1Cl',\n", " 'Cc1ccc(C(=O)Nc2ccccn2)cc1',\n", " 'COc1ccc(CCNC(=O)CSc2nnc(C)c(C)c2C#N)cc1',\n", " 'CCOC(=O)c1cn(-c2ccccc2)c(C#N)c1C#N',\n", " 'Cc1ccc(S(=O)(=O)Nc2ccc(Cl)cc2)cc1',\n", " 'CCC(C)(C)C(=O)N(c1ccccc1)c1ccc(OC)cc1',\n", " 'CCOC(=O)c1c(NC(=O)c2ccccc2)noc1C',\n", " 'Cc1cccc(NC(=O)CSc2nc3ccccc3n2C)c1',\n", " 'COc1ccc(NC(=O)c2ccc(Cl)cc2OC)cc1',\n", " 'CCN(CC)S(=O)(=O)c1ccc(C)cc1',\n", " 'COc1ccc(OC)c(NC(=O)CSc2nc3ccccc3[nH]2)c1',\n", " 'COc1ccc(C2OC(=O)c3ccccc32)cc1',\n", " 'Cc1ccc(NC(=O)c2ccc(Cl)cc2)cc1',\n", " 'Cc1ccc(NC(=O)c2ccc(NC(C)=O)cc2)cc1',\n", " 'COc1ccc(C2NC(=O)Nc3ccccc3C(=O)OCc3ccccc32)cc1',\n", " 'COc1ccc(C(=O)Oc2ccc(OC)c(OC)c2)cc1',\n", " 'CC(C)(C)NC(=O)N(C)Cc1ccc(OC)cc1',\n", " 'CC(Oc1ccccc1)S(=O)(=O)N1CCN(C)C(C)C1',\n", " 'COc1cccc(C(=O)Nc2cccc(C)c2C(=O)N)c1',\n", " 'COC(=O)c1ccc(C(=O)Nc2cccc(F)c2)cc1',\n", " 'Cc1cccc(NC(=O)c2ccc(Br)cc2)n1',\n", " 'Cc1ccc(S(=O)(=O)NC(C)(C)C)cc1',\n", " 'Cc1cccc(NC(=O)c2ccc(Cl)cc2Cl)c1',\n", " 'Cc1ccc(C(=O)Nc2ccc(S(N)(=O)=O)cc2)cc1',\n", " 'CC1=C(C(=O)Oc2ccccc2Cl)NC(c2ccco2)n2ncnc2N1',\n", " 'COc1ccc(-c2nc(-c3ccccc3)cs2)cc1',\n", " 'COc1ccc(C(=O)Nc2cccc(C)c2)cc1',\n", " 'Cc1ccc(NC(=O)c2ccccc2)c(OC)c1',\n", " 'CC1=C(C(=O)Nc2cccc(F)c2)NC(=O)NC1c1ccccn1',\n", " 'CC(=O)Nc1ccc(NC(=O)c2ccc(Cl)cc2)cc1',\n", " 'Cc1cc(C)nc(NC(=O)c2ccc(Cl)cc2)n1',\n", " 'COc1cccc(NC(=O)c2ccccc2Cl)c1',\n", " 'COc1ccc(C(=O)Nc2ccccc2C#N)cc1',\n", " 'Cc1nc2ccccc2c(=O)n1-c1ccc(Cl)cc1',\n", " 'Cc1nc2ccccc2n1Cc1ccccc1Cl',\n", " 'COc1ccc(C(=O)Nc2ccc(Cl)c(SC)c2)cc1',\n", " 'Cc1onc(-c2ccccc2)c1CNc1cccnc1',\n", " 'Cc1ccc(C(=O)Nc2cc(Cl)c(Cl)cc2)cc1',\n", " 'CC(=O)N1CCN(C(=O)c2cccnc2C)c2ccccc21',\n", " 'CC(=O)Nc1ccc(-c2ccc(C(C)=O)cc2)cc1',\n", " 'CC(=O)Nc1ccc(NC(=O)c2ccccc2)cc1',\n", " 'COc1ccc(C2CC(=O)Nc3ccccc3C2)cc1',\n", " 'COc1cc(C(=O)N(C)c2ccc(O)cc2)cc(OC)c1OC',\n", " 'CC(=O)c1ccc(NC(=O)c2ccccc2)c(OC)c1',\n", " 'COc1cccc(NC(=O)CSc2nc(C)cn2C)c1',\n", " 'COc1ccccc1CNC(=O)CSc1nc2ccccc2o1',\n", " 'CCOC(=O)c1cccc(NC(=O)c2ccc(OC)cc2)c1',\n", " 'Cc1ccc(NC(=O)C2CC(=O)N(C)c3ccccc32)cc1',\n", " 'COc1ccc(C(=O)Nc2ccc(Cl)cc2)cc1OC',\n", " 'Cc1nc2ccccc2c(=O)n1-c1ccc(Cl)cc1',\n", " 'Cc1cccc(NC(=O)c2cccc(C(=O)Nc3ccccc3)c2)n1',\n", " 'COc1ccc(C2CC(=O)Nc3ccccc3C2)cc1OC',\n", " 'COc1ccc(C(=O)NC(C)c2ccc(OC)cc2)cc1',\n", " 'COc1ccc(CNC(=O)c2ccccc2C)cc1',\n", " 'CCN(CC)C(=O)c1ccc(OC)c(NC(=O)c2ccccc2)c1',\n", " 'CC(=O)Nc1ccc(NC(=O)COc2cccc(F)c2)cc1',\n", " 'CC(=O)Nc1ccc(NC(=O)c2ccc3ccccc3n2)cc1',\n", " 'CCOC(=O)c1ccc(O)c(C)c1NC(=O)c1ccccc1',\n", " 'Cc1ccc(C(=O)Nc2ccc(Cl)cc2Cl)cc1',\n", " 'CC(C)(C)N(CC(=O)c1cccnc1)c1ccccc1',\n", " 'Cc1cc(C)n(-c2ccc(Cl)cc2)n1',\n", " 'Cc1sc(NC(=O)c2ccccc2)c(C#N)c1C',\n", " 'COc1ccc(CC(=O)Nc2ccccc2C(F)(F)F)cc1',\n", " 'Cc1cc(S(=O)(=O)NC2CCCC2)cc(N)c1',\n", " 'CC(=O)Nc1ccc(S(=O)(=O)NCc2ccccc2)cc1',\n", " 'Cc1noc(-c2ccc(Cl)cc2)c1C(=O)NC1CCCCC1',\n", " 'Cc1cccc(-c2cccnc2)c1C(=O)NCc1cccnc1',\n", " 'CCOC(=O)Nc1cccc(NC(=O)c2ccccc2)c1',\n", " 'COc1cc(C(=O)Nc2ccc(C)c(C)c2)cc(OC)c1OC',\n", " 'COc1ccc(C(=O)Nc2ccc(Cl)c(Cl)c2)cc1',\n", " 'Cc1cc(C(=O)Nc2ccccc2)cc(OC)c1',\n", " 'CC(=O)Nc1ccc(C(=O)Nc2ccccc2)cc1',\n", " 'CC(=O)Nc1ccc(NC(=O)COc2ccccc2)cc1',\n", " 'CC(=O)Nc1ccc(C(=O)Nc2ccc(Cl)cc2)cc1',\n", " 'COc1cc(Cl)ccc1S(=O)(=O)Nc1ccc(C(F)(F)F)cc1',\n", " 'COc1ccc(CC(=O)Nc2ccc(CC(F)(F)F)cc2)nc1',\n", " 'Cc1noc(C)c1C(=O)Nc1ccc(Cl)cc1',\n", " 'CC(=O)Nc1ccc(S(=O)(=O)N2CCCCC2)cc1',\n", " 'Cc1ccc(NC(=O)CSc2nnc(N)c(C)n2)cc1',\n", " 'COc1ccc(OC)c(C(=O)Nc2cccnc2)c1',\n", " 'CC(=O)N(C)c1ccc(OC)cc1C(=O)Nc1ccco1',\n", " 'Cc1ccc(S(=O)(=O)NCc2ccc3ccccc3n2)cc1',\n", " 'COc1ccc(C2C(=O)Nc3ccccc3C2)cc1OC',\n", " 'Cc1ccc(C(=O)Nc2cc(Cl)ccc2Cl)cc1',\n", " 'CC(=O)Oc1ccc(-c2ccc(Cl)cc2)cc1OC',\n", " 'COc1ccc(-c2nnc3ccccc3c2N)cc1',\n", " 'COc1ccc(C(=O)Nc2ccc(C#N)c(C)c2)cc1',\n", " 'COc1ccc(C(=O)Nc2ccccc2C(C)(C)C)c(C)c1',\n", " 'COc1ccc(-c2nc(C(=O)NCc3ccccc3)no2)cc1',\n", " 'CCOc1ccc(C(=O)Nc2ccccc2)cc1',\n", " 'CCOC(=O)c1sc2ncn(CC(=O)c3ccccc3)c2c1C',\n", " 'CCOC(=O)c1cc2cc(O)c(OC)c(OC)c2[nH]c1=O',\n", " 'COc1ccc(C(=O)Nc2ccc(F)cc2)cc1',\n", " 'CC(=O)c1c(NC(=O)COc2ccccc2)sc(C)c1C#N',\n", " 'COc1cccc(NC(=O)c2ccc(Cl)cc2Cl)c1',\n", " 'COc1ccc(C(=O)NCc2ccccc2)cc1OC',\n", " 'CCOC(=O)c1ccn(C(=O)Nc2ccccc2)c1C',\n", " 'Cc1ccc(-c2noc(-c3ccccc3)n2)cc1',\n", " 'COc1ccccc1-c1nnc(C2CC(=O)Nc3ccccc32)cc1',\n", " 'CC(=O)Nc1ccc(Nc2nc3ccccc3[nH]2)cc1',\n", " 'CC(=O)Nc1ccc(C(=O)Nc2ccc(C)cc2)cc1',\n", " 'CCOC(=O)N1CCN(C(=O)c2ccc(F)cc2)CC1',\n", " 'COc1ccc(S(=O)(=O)N2CCCC2)cc1',\n", " 'COC(=O)c1ccc(NS(=O)(=O)c2cccs2)cc1',\n", " 'COc1ccc(C(=O)N2CCN(C(=O)c3ccco3)CCC2)cc1',\n", " 'Cc1onc(-c2ccccc2)c1CNC(=O)c1ccccc1',\n", " 'COc1ccc(S(=O)(=O)N2CCCCC2)cc1',\n", " 'COc1ccc(C(=O)Nc2ccccc2)cc1',\n", " 'COc1ccc(S(=O)(=O)N2CCN(c3ccc(N)cc3)CC2)cc1',\n", " 'CCOc1cccc(NC(=O)c2ccc(Cl)cc2C#N)c1',\n", " 'Cc1ccc(-c2csc3ncnc3n2)cc1Cl',\n", " 'COc1ccc(NC(=O)c2ccc(Cl)cc2)cc1',\n", " 'Cc1cccc(NC(=O)c2ccc(Cl)cc2)c1',\n", " 'Cc1cc(C)n(C(=O)Nc2ccc(F)cc2)c(=O)c1',\n", " 'Cc1ccc(NC(=O)C2CCCN(c3ccccc3)NC2)cc1',\n", " 'CC(=O)NC1CCN(C(=O)c2cccs2)CC1',\n", " 'COC(=O)c1ccc(NC(=O)c2ccccc2)cc1',\n", " 'CCOC(=O)c1cnn(-c2ccccc2)c1N',\n", " 'CCC(=O)Nc1ccc(NC(=O)c2ccccc2C(N)=O)cc1',\n", " 'CCOC(=O)c1c(NC(=O)c2ccccc2)sc2c1CCCC2',\n", " 'COc1cc(C(=O)Nc2ccccc2)cc(OC)c1OC',\n", " 'CCOC(=O)C1=C(C)N(C)C(=O)NC1c1cccs1',\n", " 'CCOC(=O)c1cn(C)c(NC(=O)c2ccc(Cl)cc2)c1C',\n", " 'COc1cc(CNC(=O)c2ccc(OC)cc2)ccc1OC',\n", " 'CC(C)NC(=O)CN1CCN(C(=O)c2cccs2)CC1',\n", " 'COc1ccc(Nc2nc(-c3ccccc3)no2)cc1',\n", " 'CCOC(=O)c1ccc(NC(=O)c2ccccc2)cc1',\n", " 'COc1cccc(C(=O)Nc2cccc(C)c2)c1',\n", " 'COc1ccc(C(=O)Nc2cccs2)cc1OC',\n", " 'COc1ccc(C(=O)Nc2ccc(Cl)cc2Cl)cc1',\n", " 'COc1ccc(C(=O)Nc2ccc(Cl)cc2)cc1',\n", " 'COc1ccc(NC(=O)c2ccc(F)cc2)cc1',\n", " 'COc1ccccc1NC(=O)c1cccc(NC(=O)c2ccccn2)c1',\n", " 'CCOc1ccc(NC(=O)c2ccc(Cl)c(Cl)c2)cc1',\n", " 'COc1ccc(-c2nnc(C)c(N)n2)cc1OC',\n", " 'COc1ccccc1CNC(=O)c1ccc(C)cc1',\n", " 'Cc1cccc(NC(=O)c2ccc(Cl)cc2)c1',\n", " 'COC(=O)c1cc(C)n(-c2ccc(Cl)cc2)c1C',\n", " 'Cc1sc2ncnc(NC(=O)c3cccs3)c2c1C',\n", " 'COc1ccc(NC(=O)c2ccc(OC)cc2)cc1',\n", " 'CCOC(=O)c1sc2nc[nH]c(=O)c2c1C#N',\n", " 'CCN(CC)S(=O)(=O)c1ccc(NC(=O)c2ccccn2)cc1',\n", " 'Cc1ccc(S(=O)(=O)Nc2ccccc2)cc1',\n", " 'CCOC(=O)Cn1nc2ccc(Br)cc2n1',\n", " 'Cc1cccc(C(=O)Nc2ccc3ccccc3c2)c1',\n", " 'COC(=O)C1=C(C)NC(=O)NC1c1ccc(F)cc1',\n", " 'COc1cc(C(=O)Nc2ccccn2)cc(OC)c1OC',\n", " 'Cc1ccc(NC(=O)c2ccccc2)c(OC)c1',\n", " 'COc1ccc(OC)c(NC(=O)c2ccc(C)cc2)c1',\n", " 'COc1ccc(C(=O)N2CCN(C(=O)c3ccccc3)CCC2)cc1',\n", " 'COc1ccc(C2OC(=O)Nc3ccccc3O2)cc1',\n", " 'COc1ccc(OC)c(-c2nnc(N)nc(NC(C)=O)s2)c1',\n", " 'COc1ccc(C2C(=O)Nc3ccccc3S2)cc1',\n", " 'COc1ccc(C2CC(=O)Nc3ccccc32)cc1OC',\n", " 'CC(=O)N1CCN(S(=O)(=O)c2ccccc2)CC1',\n", " 'COC(=O)c1sc(NC(=O)c2ccccc2)cc1C#N',\n", " 'CC(=O)Nc1ccc(NC(=O)c2ccc(F)cc2)cc1',\n", " 'CCOC(=O)c1ccc(NC(=O)c2cncc(C)c2)cc1',\n", " 'Cc1ccc(NC(=O)CSc2nnc(C)c(-c3ccccc3)n2)cc1',\n", " 'COc1ccc(C(=O)Nc2cccc(Cl)c2)cc1',\n", " 'COc1cccc(C2OC(=O)c3ccccc32)c1',\n", " 'Cn1c(=O)c2c(ncn2Cc2ccccc2)c(=O)n(C)c1=O',\n", " 'CC(=O)Nc1ccc(S(=O)(=O)NCCc2ccccn2)cc1',\n", " 'CCOC(=O)c1cnc2cccc2n1NC(=O)CCc1ccccc1',\n", " 'COc1ccc(NC(=O)c2ccc(Cl)cc2)cc1',\n", " 'COc1cccc(NC(=O)CSc2nc3ccccc3n2C)c1',\n", " 'Cc1cc(C)n(C(=O)Nc2ccccc2)c(=O)c1C#N',\n", " 'CCOC(=O)Nc1ccc(NC(=O)c2cccs2)cc1',\n", " 'COC(=O)c1nc2cc(-c3ccccc3)n2ccc1',\n", " 'CC(=O)Nc1scc(NC(=O)c2ccccc2)c1C',\n", " 'CC(=O)c1ccc(C(=O)NCCc2cccs2)cc1',\n", " 'COc1ccccc1C1CC(=O)Nc2ccc(C(F)(F)F)cc21',\n", " 'COc1ccc(NC(=O)CSc2ncc3cccc23)cc1',\n", " 'COC(=O)c1sc(NC(=O)c2ccc(Cl)cc2)nc1C',\n", " 'CC(=O)Nc1ccc(NC(=O)CSc2nc3ccccc3o2)cc1',\n", " 'Cc1ccc(NC(=O)c2cccc3cc(c2)OCO3)cc1',\n", " 'COc1ccccc1C(=O)Oc1sc(C)c(C(N)=O)c1',\n", " 'CCOC(=O)c1sc2ncn(CC(C)C)c2c1C',\n", " 'COc1ccc(NC(=O)c2ccc(OC)cc2)cc1',\n", " 'COc1ccc(C(=O)Nc2cccc(C)c2)cc1OC',\n", " 'COc1cc(-c2cc(F)c(C)cc2C)cc(OC)c1OC',\n", " 'COc1ccc(NC(=O)c2ccccc2Cl)cc1',\n", " 'COc1ccccc1NC(=O)c1ccccc1C(=O)Nc1cccs1',\n", " 'CC(C)(C)c1ccc(NC(=O)COc2ccccc2)cc1',\n", " 'Cc1nc2ccccc2c(NC(=O)c2ccccc2)c1',\n", " 'COc1cc(C(=O)Nc2ccccc2C(F)(F)F)ccc1OC',\n", " 'CC(=O)Nc1ccc(NC(=O)c2ccc(OC)c(OC)c2)cc1',\n", " 'Cc1onc(-c2ccccc2Cl)c1C(=O)Nc1ccccc1',\n", " 'CC(=O)Nc1ccc(C(=O)Nc2ccccc2CC(C)(C)C)cc1',\n", " 'Cn1c(=O)c2ccccc2cc(SCc2ccccc2)c1=O',\n", " 'COc1ncc2c3c(=O)oc(C(=O)NCC(C)C)c3c(=O)c(C)c2c1',\n", " 'CC(=O)N1CCN(C(=O)c2ccccc2F)c2ccccc21',\n", " 'Cc1cccc(C(=O)Nc2ccc(Cl)cc2)c1',\n", " 'Cc1ccc(NC(=O)CSc2nc(C)cc(C)c2C)cc1',\n", " 'CC(=O)Nc1ccc(NC(=O)c2c(Cl)cccc2C#N)cc1',\n", " 'Cc1c(N)nc(SCC(=O)c2ccccc2)n1CC(=O)Nc1cccnc1',\n", " 'CN(C)S(=O)(=O)c1ccc(NC(=O)c2ccc(F)cc2)cc1',\n", " 'CN(C)C(=O)c1nnc2ccc(Cl)cc2c1OCC',\n", " 'Cc1cc(C)n(Cc2c(Cl)cccc2Cl)c(=O)n1',\n", " 'COc1ccc(-c2nc(-c3cccs3)no2)cc1',\n", " 'COc1ccc(Cl)cc1NC(=O)c1ccc(-c2nonc2N)cc1',\n", " 'CCOC(=O)c1cccc(NC(=O)c2ccc(NC(C)=O)cc2)c1',\n", " 'COc1ccc(C(=O)NCC2CCCCCCC2)cc1',\n", " 'CC(=O)Nc1ccc(NS(=O)(=O)c2ccc(Cl)cc2)cc1',\n", " 'Cc1cc(C)n(C(=O)Nc2ccc(Cl)cc2)n1',\n", " 'COc1cccc(CNC(=O)c2ccc(OC)cc2)c1',\n", " 'COc1cc(C(=O)Nc2nc(C)ccc2C)cc(OC)c1OC',\n", " 'CC(=O)Nc1ccc(C(=O)Nc2ccn(C)c2)cc1',\n", " 'COc1ccc(NC(=O)c2ccccc2Cl)cc1',\n", " 'CC(=O)Nc1ccc(NC(=O)c2cccc(F)c2)cc1',\n", " 'CCN(CC)C(=O)c1ccc(OC(=O)Nc2ccccc2)cc1',\n", " 'CC(=O)Nc1ccc(NC2C(=O)c3ccccc3C2)cc1',\n", " 'COc1cc(OC)cc(C(=O)Nc2cc(Cl)ccc2Cl)c1',\n", " 'COc1cc(C(=O)Nc2ccccc2C#N)cc(OC)c1OC',\n", " 'COc1ccc(C(=O)NC2CCN(c3cccs3)CC2)cc1']" ] }, "metadata": {}, "execution_count": 23 } ], "source": [ "gen_smiles" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "outputId": "3db18e7e-4824-4bf4-a32c-5a5075168929", "id": "Sb4rde_0M8Jl" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'Valid molecules % = 319'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 25 } ], "source": [ "\"Valid molecules % = {}\".format(len(molecules))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kzAjOnY_M8Jl" }, "outputs": [], "source": [ "mol_dict = []\n", "for i in molecules:\n", " mol_dict.append({'molecule' : i, 'qed': qed(i), 'smiles': Chem.MolToSmiles(i)})" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "39BNrxcTM8Jl" }, "outputs": [], "source": [ "results = pd.DataFrame(mol_dict)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "f40b484b-fcba-4ced-b653-de378758d2f8", "id": "GytZST97M8Jl" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " molecule qed \\\n", "0 0.878882 \n", "1 0.881082 \n", "2 0.707781 \n", "3 0.678701 \n", "4 0.783312 \n", "5 0.941276 \n", "6 0.807600 \n", "7 0.878873 \n", "8 0.876200 \n", "9 0.864057 \n", "10 0.860424 \n", "11 0.811711 \n", "12 0.837287 \n", "13 0.874613 \n", "14 0.717143 \n", "15 0.855347 \n", "16 0.841536 \n", "17 0.873587 \n", "18 0.663220 \n", "19 0.899845 \n", "20 0.932065 \n", "21 0.794003 \n", "22 0.788492 \n", "23 0.829125 \n", "24 0.895830 \n", "25 0.726468 \n", "26 0.856963 \n", "27 0.501545 \n", "28 0.946285 \n", "29 0.933409 \n", "30 0.712747 \n", "31 0.884463 \n", "32 0.924984 \n", "33 0.877083 \n", "34 0.831132 \n", "35 0.864707 \n", "36 0.868283 \n", "37 0.866578 \n", "38 0.855010 \n", "39 0.767471 \n", "40 0.745913 \n", "41 0.887220 \n", "42 0.888703 \n", "43 0.841396 \n", "44 0.627649 \n", "45 0.792612 \n", "46 0.773102 \n", "47 0.875962 \n", "48 0.833716 \n", "49 0.879208 \n", "50 0.713352 \n", "51 0.918215 \n", "52 0.909016 \n", "53 0.654133 \n", "54 0.780083 \n", "55 0.798360 \n", "56 0.907214 \n", "\n", " smiles \n", "0 COC(=O)c1cccc(NC(=O)c2ccc(C)c(C)c2)c1 \n", "1 COC(=O)c1sc(NS(=O)(=O)c2ccccc2)cc1C \n", "2 Cc1onc(-c2ccccc2)c1OCc1ccccc1 \n", "3 COC(=O)c1nc(N)cc(C(=O)c2cccs2)c1C#N \n", "4 Cc1ccc(OCC(=O)Nc2ccc3ccccc3c2)cc1 \n", "5 CN(C)S(=O)(=O)c1cccc(NC(=O)c2ccccc2)c1 \n", "6 Cc1ccc(-n2nc(C(N)=O)c(-c3csc(C)c3)n2)cc1 \n", "7 COc1ccc(C2Cc3ccccc3NC(=O)N2)cc1 \n", "8 COc1cc(S(=O)(=O)Nc2ccc(C)c(C)c2)cc(OC)c1O \n", "9 CCOC(=O)N1CCN(c2cccc(OC)c2)c2ccccc21 \n", "10 CCOC(=O)N1CCN(C(=O)Cc2ccccc2)CC(C)(C)C1 \n", "11 CCOC(=O)c1cc(C)n(Cc2ccc(F)cc2)c1C#N \n", "12 Cc1sc(NC(=O)c2ccc(Cl)cc2)c(Cl)c1C \n", "13 CCOC(=O)c1cccc(C(=O)Nc2ccc(C)cc2)c1 \n", "14 CC1CCN(C(=O)c2ccccc2Cl)C1 \n", "15 Cc1onc(-c2ccccc2)c1C(=O)N1CCCN(C)C1C \n", "16 COC(=O)Cn1cnc(Nc2ccccc2)c1C#N \n", "17 Cn1cnnc1SCC(=O)Nc1ccc(Cl)cc1 \n", "18 Cc1cc(C)nc(SCC(=O)Nc2ccc(C(C)(F)F)cc2)n1 \n", "19 CCN(CC)C(=O)Nc1ccc2ccccc2n1 \n", "20 COc1ccc(C(=O)Nc2ccc3c(c2)OCO3)cc1 \n", "21 COc1ccc(NC(C)=O)cc1Cl \n", "22 CCOC(=O)c1ccc(-c2ccc(OC)c(OC)c2)cc1 \n", "23 COC(=O)c1ccc(NC(=O)COc2ccccc2)c(OC)c1 \n", "24 COc1ccc(C(=O)Nc2cccc(S(N)(=O)=O)c2)cc1 \n", "25 Cn1cnnc1SCC(=O)NCc1nc2ccccc2s1 \n", "26 COc1cccc(NC(=O)CNc2ccccc2OC)c1 \n", "27 Cc1cc(N)nc(SCC(=O)Nc2nc3ccccc3[nH]2)n1 \n", "28 Cc1cc(C)c(S(=O)(=O)Nc2ccccn2)cc1Cl \n", "29 Cc1ccc(NS(=O)(=O)c2ccccc2C#N)cc1 \n", "30 CCc1c(C)sc2ncnc(Cc3ccccc3)c12 \n", "31 CC(=O)Nc1ccc(NC(=O)c2conc2C)cc1 \n", "32 COc1cccc(CNC(=O)c2ccc(C)cc2C)c1 \n", "33 COC(=O)c1ccc(NS(=O)(=O)c2ccccc2)cc1 \n", "34 CCOC(=O)C1=C(C)NC(=O)NC1c1ccccc1F \n", "35 CCOC(=O)C1=C(C)N(C)C(=O)NC1c1ccc(OCl)cc1 \n", "36 COc1ccc(S(=O)(=O)c2ccc(Cl)cc2)cc1 \n", "37 CC(C)(C)CNS(=O)(=O)c1ccc(NC(=O)c2cccs2)cc1 \n", "38 COc1ccc(CNC(=O)Cn2nc(C)cc2N)cc1 \n", "39 Cn1cncc1SCc1cccc(F)c1F \n", "40 CC(=O)Nc1ccc2oc(-c3ccccc3)cc2c1 \n", "41 COc1cc(OC)cc(OCC(=O)Nc2ccccc2)c1 \n", "42 CC(=O)NC1Nc2ccccc2C1c1ccccc1Cl \n", "43 COc1ccc(OC)c(NC(=O)c2ccc(NC(C)=O)cc2OC)c1 \n", "44 COc1ccc2c(=O)c3cc(Cl)ccc3oc2c1 \n", "45 CCOC(=O)NC1=CC2=C(C)C=C(Cl)C2=C1N \n", "46 CC(=O)Nc1ccc(-c2nc3ccccc3n2C)cc1 \n", "47 CCOC(=O)c1c(NC(=O)c2ccccc2)sc2c1CCC2 \n", "48 Cn1cnnc1SCC(=O)Nc1ccc(N2CCOCC2)cc1 \n", "49 COC(=O)c1sc(NC(=O)c2cccs2)cc1C \n", "50 Cn1c(=O)c2c(ncn2C(=O)Nc2ccc(F)cc2)n(C)c1=O \n", "51 COc1cccc(NC(=O)c2ccc(OC)c(OC)c2)c1 \n", "52 COc1cc(O)c(OC)c(C(=O)Nc2cccc(Cl)c2)c1 \n", "53 Cc1cccc(C)c1NC(=O)CSc1nc(-c2ccc(Cl)cc2)no1 \n", "54 Cc1onc(-c2ccccc2)c1CNC(=O)c1ccccc1Cl \n", "55 Cc1onc(-c2ccccc2)c1C(=O)Nc1ccccn1 \n", "56 CC(=O)Nc1ccc(C(=O)Nc2ccc(Cl)cc2)cc1 " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
moleculeqedsmiles
0
\"Mol\"/
0.878882COC(=O)c1cccc(NC(=O)c2ccc(C)c(C)c2)c1
1
\"Mol\"/
0.881082COC(=O)c1sc(NS(=O)(=O)c2ccccc2)cc1C
2
\"Mol\"/
0.707781Cc1onc(-c2ccccc2)c1OCc1ccccc1
3
\"Mol\"/
0.678701COC(=O)c1nc(N)cc(C(=O)c2cccs2)c1C#N
4
\"Mol\"/
0.783312Cc1ccc(OCC(=O)Nc2ccc3ccccc3c2)cc1
5
\"Mol\"/
0.941276CN(C)S(=O)(=O)c1cccc(NC(=O)c2ccccc2)c1
6
\"Mol\"/
0.807600Cc1ccc(-n2nc(C(N)=O)c(-c3csc(C)c3)n2)cc1
7
\"Mol\"/
0.878873COc1ccc(C2Cc3ccccc3NC(=O)N2)cc1
8
\"Mol\"/
0.876200COc1cc(S(=O)(=O)Nc2ccc(C)c(C)c2)cc(OC)c1O
9
\"Mol\"/
0.864057CCOC(=O)N1CCN(c2cccc(OC)c2)c2ccccc21
10
\"Mol\"/
0.860424CCOC(=O)N1CCN(C(=O)Cc2ccccc2)CC(C)(C)C1
11
\"Mol\"/
0.811711CCOC(=O)c1cc(C)n(Cc2ccc(F)cc2)c1C#N
12
\"Mol\"/
0.837287Cc1sc(NC(=O)c2ccc(Cl)cc2)c(Cl)c1C
13
\"Mol\"/
0.874613CCOC(=O)c1cccc(C(=O)Nc2ccc(C)cc2)c1
14
\"Mol\"/
0.717143CC1CCN(C(=O)c2ccccc2Cl)C1
15
\"Mol\"/
0.855347Cc1onc(-c2ccccc2)c1C(=O)N1CCCN(C)C1C
16
\"Mol\"/
0.841536COC(=O)Cn1cnc(Nc2ccccc2)c1C#N
17
\"Mol\"/
0.873587Cn1cnnc1SCC(=O)Nc1ccc(Cl)cc1
18
\"Mol\"/
0.663220Cc1cc(C)nc(SCC(=O)Nc2ccc(C(C)(F)F)cc2)n1
19
\"Mol\"/
0.899845CCN(CC)C(=O)Nc1ccc2ccccc2n1
20
\"Mol\"/
0.932065COc1ccc(C(=O)Nc2ccc3c(c2)OCO3)cc1
21
\"Mol\"/
0.794003COc1ccc(NC(C)=O)cc1Cl
22
\"Mol\"/
0.788492CCOC(=O)c1ccc(-c2ccc(OC)c(OC)c2)cc1
23
\"Mol\"/
0.829125COC(=O)c1ccc(NC(=O)COc2ccccc2)c(OC)c1
24
\"Mol\"/
0.895830COc1ccc(C(=O)Nc2cccc(S(N)(=O)=O)c2)cc1
25
\"Mol\"/
0.726468Cn1cnnc1SCC(=O)NCc1nc2ccccc2s1
26
\"Mol\"/
0.856963COc1cccc(NC(=O)CNc2ccccc2OC)c1
27
\"Mol\"/
0.501545Cc1cc(N)nc(SCC(=O)Nc2nc3ccccc3[nH]2)n1
28
\"Mol\"/
0.946285Cc1cc(C)c(S(=O)(=O)Nc2ccccn2)cc1Cl
29
\"Mol\"/
0.933409Cc1ccc(NS(=O)(=O)c2ccccc2C#N)cc1
30
\"Mol\"/
0.712747CCc1c(C)sc2ncnc(Cc3ccccc3)c12
31
\"Mol\"/
0.884463CC(=O)Nc1ccc(NC(=O)c2conc2C)cc1
32
\"Mol\"/
0.924984COc1cccc(CNC(=O)c2ccc(C)cc2C)c1
33
\"Mol\"/
0.877083COC(=O)c1ccc(NS(=O)(=O)c2ccccc2)cc1
34
\"Mol\"/
0.831132CCOC(=O)C1=C(C)NC(=O)NC1c1ccccc1F
35
\"Mol\"/
0.864707CCOC(=O)C1=C(C)N(C)C(=O)NC1c1ccc(OCl)cc1
36
\"Mol\"/
0.868283COc1ccc(S(=O)(=O)c2ccc(Cl)cc2)cc1
37
\"Mol\"/
0.866578CC(C)(C)CNS(=O)(=O)c1ccc(NC(=O)c2cccs2)cc1
38
\"Mol\"/
0.855010COc1ccc(CNC(=O)Cn2nc(C)cc2N)cc1
39
\"Mol\"/
0.767471Cn1cncc1SCc1cccc(F)c1F
40
\"Mol\"/
0.745913CC(=O)Nc1ccc2oc(-c3ccccc3)cc2c1
41
\"Mol\"/
0.887220COc1cc(OC)cc(OCC(=O)Nc2ccccc2)c1
42
\"Mol\"/
0.888703CC(=O)NC1Nc2ccccc2C1c1ccccc1Cl
43
\"Mol\"/
0.841396COc1ccc(OC)c(NC(=O)c2ccc(NC(C)=O)cc2OC)c1
44
\"Mol\"/
0.627649COc1ccc2c(=O)c3cc(Cl)ccc3oc2c1
45
\"Mol\"/
0.792612CCOC(=O)NC1=CC2=C(C)C=C(Cl)C2=C1N
46
\"Mol\"/
0.773102CC(=O)Nc1ccc(-c2nc3ccccc3n2C)cc1
47
\"Mol\"/
0.875962CCOC(=O)c1c(NC(=O)c2ccccc2)sc2c1CCC2
48
\"Mol\"/
0.833716Cn1cnnc1SCC(=O)Nc1ccc(N2CCOCC2)cc1
49
\"Mol\"/
0.879208COC(=O)c1sc(NC(=O)c2cccs2)cc1C
50
\"Mol\"/
0.713352Cn1c(=O)c2c(ncn2C(=O)Nc2ccc(F)cc2)n(C)c1=O
51
\"Mol\"/
0.918215COc1cccc(NC(=O)c2ccc(OC)c(OC)c2)c1
52
\"Mol\"/
0.909016COc1cc(O)c(OC)c(C(=O)Nc2cccc(Cl)c2)c1
53
\"Mol\"/
0.654133Cc1cccc(C)c1NC(=O)CSc1nc(-c2ccc(Cl)cc2)no1
54
\"Mol\"/
0.780083Cc1onc(-c2ccccc2)c1CNC(=O)c1ccccc1Cl
55
\"Mol\"/
0.798360Cc1onc(-c2ccccc2)c1C(=O)Nc1ccccn1
56
\"Mol\"/
0.907214CC(=O)Nc1ccc(C(=O)Nc2ccc(Cl)cc2)cc1
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 25 } ], "source": [ "results" ] }, { "cell_type": "code", "source": [], "metadata": { "id": "uYgVRVT5CXft" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 469 }, "id": "tmwbGaSTH37h", "outputId": "8cc62188-d8bb-46d2-d345-dd75f4c816e3" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Text(0.5, 1.0, 'QED plot')" ] }, "metadata": {}, "execution_count": 26 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ], "source": [ "import matplotlib.pyplot as plt\n", "sns.kdeplot(results['qed'].values)\n", "plt.title('QED plot')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "SPrWfwl3H37h", "outputId": "128c91c0-435f-44f1-e860-aa3de030824f" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
moleculeqedsmiles
0
\"Mol\"/
0.677112CCOC(=O)c1cnn(-c2nc(C)nc3sc(CC)c(C)c23)c1C
1
\"Mol\"/
0.728099CCc1cccc(CC(=O)Cn2cnc3ccccc3c2=O)c1
2
\"Mol\"/
0.783137CC(C)(C)NC(=O)Nc1ccc(C(=O)NC(C)(C)C)cc1
3
\"Mol\"/
0.895424CC(=O)Nc1ccc(NC(=O)Cc2ccccc2)cc1
4
\"Mol\"/
0.742755Cn1c(=O)c2c(ncn2CC(=O)Nc2ccc(F)cc2)n(C)c1=O
5
\"Mol\"/
0.806002CCOC(=O)c1c[nH]c2ccc(OC)cc12
6
\"Mol\"/
0.745607CC(=O)c1ccc(NC(=O)c2cccc(-n3cnnn3)c2)cc1
7
\"Mol\"/
0.727011Cc1ccccc1NC(=O)CSc1nnc(-c2ccco2)o1
8
\"Mol\"/
0.689722COc1ccc(NC(=O)Cn2nnc(-c3ccccc3N)n2)cc1
9
\"Mol\"/
0.622855CN(C)C(=O)CSc1nc2ccccc2c(=O)n1C
10
\"Mol\"/
0.771935COc1ccc(-n2nnnc2SCC(=O)N2CCCCC2C)cc1
11
\"Mol\"/
0.775285CN(C)C(=O)CSc1nnc(-c2cccnc2)o1
12
\"Mol\"/
0.719536Cc1ccccc1NC(=O)CSc1nnc(-c2cccs2)o1
13
\"Mol\"/
0.786158CC(=O)N(Cc1cccs1)Cc1cc2cc(C)cc(C)c2[nH]c1=O
14
\"Mol\"/
0.682315Cc1ccc(-n2nnnc2SCC(=O)NCc2cccnc2)cc1C
15
\"Mol\"/
0.798353Cc1ccc(-c2nnc(SCC(=O)N3CCOCC3)n2C)cc1
16
\"Mol\"/
0.778989COc1ccc(NC(=O)Cn2nnc(-c3cccs3)n2)cc1
17
\"Mol\"/
0.938534CC(=O)N(Cc1cc2ccc(C)cc2[nH]c1=O)CC1CCCCO1
18
\"Mol\"/
0.677132CCC(C)NC(=O)CSc1nc2ccccc2c(=O)n1C
19
\"Mol\"/
0.642878CC(=O)Nc1ccc(NC(=O)CSc2nnc(-c3cccnc3)o2)cc1
20
\"Mol\"/
0.456887COC(=O)c1ccc(C(=O)CSc2nnc(C3CC3)n2C)o1
21
\"Mol\"/
0.800048CC(=O)Nc1ccccc1-c1nnn(CCC(=O)NCC(C)C)n1
22
\"Mol\"/
0.642831CCC(=O)c1ccc(NC(=O)CSc2nnnn2C)cc1
23
\"Mol\"/
0.862908CCc1nnc(SCC(=O)Nc2ccc(C)cc2C)n1C
24
\"Mol\"/
0.692685COc1cccc(C(=O)Nc2ccc(OC(C)=O)cc2)c1
25
\"Mol\"/
0.800350Cc1cccc(C)c1NC(=O)Cn1nnc(-c2ccccc2F)n1
26
\"Mol\"/
0.822039CCC(C)(C)c1nnc(SCC(=O)Nc2ccccc2C)n1C
27
\"Mol\"/
0.736507Cc1cccc2cc(C(=O)Nc3ccccc3F)c(=O)oc12
28
\"Mol\"/
0.719212Cc1ccc(NC(=O)CSc2nnc(-c3ccco3)o2)cc1C
29
\"Mol\"/
0.890984CC(=O)Nc1ccc(NC(=O)c2cccs2)cc1
30
\"Mol\"/
0.593680CC(=O)Oc1ccc(NC(=O)Cn2cnc3ccccc32)cc1
31
\"Mol\"/
0.784833CC(=O)Nc1ccc(NC(=O)CSc2nnc(C3CC3)n2C)cc1
32
\"Mol\"/
0.802323Cc1ccc(NC(=O)Cn2nnc(-c3cccs3)n2)cc1
33
\"Mol\"/
0.784833CC(=O)Nc1ccc(NC(=O)CSc2nnc(C3CC3)n2C)cc1
34
\"Mol\"/
0.876471COc1ccc(NC(=O)N(C(C)C)C(C)C)cc1NC(C)=O
35
\"Mol\"/
0.677053COc1ccc(-n2nnnc2SCc2ccccc2F)cc1
36
\"Mol\"/
0.514397Cc1ccc(NC(=O)CCSc2nnc(-c3ccc(N)cc3)n2C)cc1
37
\"Mol\"/
0.703942COC(=O)c1ccc(NC(=O)Cn2nnc(-c3ccco3)n2)cc1
38
\"Mol\"/
0.871646CNC(=O)CSc1nnc(-c2cccc(Cl)c2)o1
39
\"Mol\"/
0.889039CC(=O)Nc1ccc(NC(=O)c2ccc(Cl)cc2Cl)cc1
40
\"Mol\"/
0.597320CC(C)CCSc1nc2ccccc2c(=O)n1CC1CCCO1
41
\"Mol\"/
0.820129COC(=O)C1=C(C)NC(=O)NC1c1cccc(OC)c1
42
\"Mol\"/
0.828800COc1ccc(NC(=O)CSc2nnc(C(C)C)n2C)cc1
43
\"Mol\"/
0.589398COC(=O)c1ccc(-n2nnnc2SCC(=O)N2CCCCC2)cc1
44
\"Mol\"/
0.807891Cc1ccc(-n2nnnc2SCC(=O)N2CCCCC2)cc1
45
\"Mol\"/
0.879891CC(C)CNC(=O)Cn1nnc(-c2ccc3c(c2)OCO3)n1
46
\"Mol\"/
0.869387Cc1ccc(NC(=O)CSc2nncn2C)cc1C
47
\"Mol\"/
0.880821Cc1nnc(SCC(=O)Nc2ccccc2Cl)n1C
48
\"Mol\"/
0.613201CCOC(=O)c1cc(CC)c(C(=O)c2ccc(Cl)cc2)nc1C
49
\"Mol\"/
0.592722COC(=O)CSc1nnc(-c2ccco2)o1
50
\"Mol\"/
0.803457Cc1cccc(NC(=O)Cn2nnc(-c3cccs3)n2)c1C
51
\"Mol\"/
0.776652COc1ccc(-n2nnnc2SCC(=O)N2CCCC2C)cc1
52
\"Mol\"/
0.830220CC(C)CNC(=O)CSc1nnc(-c2cccs2)o1
53
\"Mol\"/
0.797543Cc1ccccc1NC(=O)c1cccc(-n2cnnn2)c1
54
\"Mol\"/
0.938409CC(=O)N(Cc1cc2cc(C)cc(C)c2[nH]c1=O)CC1CCCO1
55
\"Mol\"/
0.807891Cc1ccc(-n2nnnc2SCC(=O)N2CCCCC2)cc1
56
\"Mol\"/
0.884997CC(C)C(=O)Nc1ccc(NC(=O)C2CCCCC2)cc1
57
\"Mol\"/
0.877238CC(C)(C)NC(=O)CSc1nnc(-c2ccccc2)o1
58
\"Mol\"/
0.669455CC(C)c1cccc(C(C)c2nc3ccccc3n2C)c1
59
\"Mol\"/
0.828525COc1ccc(NC(=O)CSc2nnc(C3CC3)n2C)cc1
60
\"Mol\"/
0.695628CCc1ccccc1NC(=O)CSc1nnc(-c2ccco2)o1
61
\"Mol\"/
0.644309CCOC(=O)CSc1nnc(-c2ccc(NC(C)=O)cc2)o1
62
\"Mol\"/
0.795909CCn1c(SCC(=O)N2CCCC2)nnc1-c1ccccc1
63
\"Mol\"/
0.838070Cc1ccccc1NC(=O)Nc1cccc(F)c1Cl
64
\"Mol\"/
0.515580COC(=O)c1ccc(-n2nnnc2SCC(=O)Nc2ccccc2C)cc1
65
\"Mol\"/
0.842617CCn1nnnc1SCC(=O)Nc1ccc(C)cc1
66
\"Mol\"/
0.861415Cc1cccc(NC(=O)CSc2nnc(C3CC3)n2C)c1C
67
\"Mol\"/
0.635345CNC(=O)CSc1nnc(-c2cccc(N)c2)o1
68
\"Mol\"/
0.802731CC(=O)Nc1ccc(-c2nnc(SCC(N)=O)o2)cc1
69
\"Mol\"/
0.731640Cc1cc(=O)oc2cc(OC(=O)Nc3ccccc3)ccc12
70
\"Mol\"/
0.862762Cc1ccc(NC(=O)CSc2nnc(C(C)C)n2C)cc1
71
\"Mol\"/
0.641072Cc1ccc(-n2nnnc2SCC(=O)Nc2ccccc2C(N)=O)cc1
72
\"Mol\"/
0.763287CC(=O)Nc1ccc(NC(=O)c2cccc(-n3cnnn3)c2)cc1
73
\"Mol\"/
0.812857Cc1ccc(-c2nnc(SCC(=O)N3CCCC3)o2)cc1
74
\"Mol\"/
0.881461CCCC(=O)Nc1nnc(COc2ccccc2)s1
75
\"Mol\"/
0.879589CC(C)(C)NC(=O)CSc1nnc(-c2ccc(F)cc2)o1
76
\"Mol\"/
0.810280Cc1ccc2nc(SCC(=O)N3CCCCC3)c(C#N)cc2c1
77
\"Mol\"/
0.689292Cc1ccc(C(C)C)cc1NC(=O)Cn1nnc(-c2ccccc2N)n1
78
\"Mol\"/
0.862933CC(C)c1nnc(SCC(=O)Nc2ccccc2F)n1C
79
\"Mol\"/
0.857897COc1ccc(CC(=O)Nc2cccc(C(C)=O)c2)cc1
80
\"Mol\"/
0.765705CCCn1c(SCC(=O)N2CCCC2)nnc1-c1ccco1
81
\"Mol\"/
0.694083COc1ccc(NC(=O)CSc2nnc(-c3ccco3)o2)cc1
82
\"Mol\"/
0.500273Cc1nn(-c2cc3ccccc3oc2=O)c2ccccc12
83
\"Mol\"/
0.829153CCn1c(SCC(=O)Nc2cccc(C)n2)nnc1C1CC1
84
\"Mol\"/
0.678305COc1ccc(C(=O)CC(=O)Nc2cccc(C)c2C)cc1
85
\"Mol\"/
0.915541COc1ccc(CC(=O)Nc2ccc(CC#N)cc2)cc1
86
\"Mol\"/
0.702972CCc1ccc(-n2c(SCC(N)=O)nnc2-c2ccco2)cc1
87
\"Mol\"/
0.594450CC(C)(C)C(=O)Oc1ccc(-n2cnnn2)cc1
88
\"Mol\"/
0.874446Cc1ccc(-n2nnnc2SCC(=O)NC(C)(C)C)cc1
89
\"Mol\"/
0.489481CCOC(=O)CSc1nc(C)nc2sc3c(c12)CCCC3
90
\"Mol\"/
0.745955COc1ccc(CNC(=O)Cn2nnc(-c3ccccc3)n2)cc1
91
\"Mol\"/
0.877238CC(C)(C)NC(=O)CSc1nnc(-c2ccccc2)o1
92
\"Mol\"/
0.725361Cc1ccc(NC(=O)CSc2nnc(-c3cccnc3)o2)cc1
93
\"Mol\"/
0.806911CC(C)C(=O)N(Cc1ccco1)CC1CCCO1
94
\"Mol\"/
0.784833CC(=O)Nc1cccc(NC(=O)CSc2nnc(C3CC3)n2C)c1
95
\"Mol\"/
0.662338COC(=O)c1ccc(NC(=O)CSc2nncn2C)cc1
96
\"Mol\"/
0.879391CC(=O)Nc1ccc(NC(=O)CCc2ccccc2Cl)cc1
97
\"Mol\"/
0.643678CCc1nc2sc3c(c2c(=O)n1CC(=O)OC)CCCCC3
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "show(results)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-eHjSOTHH37i" }, "outputs": [], "source": [ "from rdkit.DataStructs import TanimotoSimilarity\n", "from rdkit.Chem import AllChem" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "id": "nCyiYi2MH37i", "outputId": "62c3d694-22af-4fb7-f3eb-3605d03f2a0b" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" }, "text/plain": [ "'Diversity of molecules % = 0.7729722352118349'" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fp_list = []\n", "for molecule in molecules:\n", " fp = AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=1024)\n", " fp_list.append(fp)\n", "\n", "diversity = []\n", "for i in range(len(fp_list)):\n", " for j in range(i+1, len(fp_list)):\n", " current_diverity = 1 - float(TanimotoSimilarity(fp_list[i], fp_list[j]))\n", " diversity.append(current_diverity)\n", "\n", "\"Diversity of molecules % = {}\".format(np.mean(diversity))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "aLRmtmWVH37i" }, "outputs": [], "source": [ "with open('gen_gpt_moses_1k.txt', 'w') as f:\n", " for line in gen_smiles:\n", " f.write(f\"{line}\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "G4Lfc2rtB4yM" }, "outputs": [], "source": [] } ] }