BrightBlueCheese committed
Commit 7f92264
1 Parent(s): 90d0c74
.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
+ {
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
.ipynb_checkpoints/datamodule_finetune_sl-checkpoint.py CHANGED
@@ -61,7 +61,7 @@ class CustomLlamaDatasetAbraham(Dataset):
         return {
             "input_ids": torch.tensor(local_encoded["input_ids"]),
             "attention_mask": torch.tensor(local_encoded["attention_mask"]),
-            "labels": None,
+            "labels": torch.tensor(local_encoded["input_ids"]), # this one does not matter for sl
         }
 
 class CustomFinetuneDataModule(L.LightningDataModule):
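For context: this change swaps the `None` placeholder for a tensor copy of the input IDs. The traceback in `Untitled.ipynb` (added later in this commit) shows why `None` breaks inference: `DataCollatorWithPadding` hands every field of the returned dict to the tokenizer's `pad`, which ends up calling `torch.tensor(None)` and fails with `RuntimeError: Could not infer dtype of NoneType`, surfaced as the `ValueError` about padding/truncation on the `labels` feature. A minimal sketch of the resulting `__getitem__` follows; the tokenizer call and its arguments are illustrative assumptions, only the returned dict mirrors the diff above.

# Hypothetical sketch of CustomLlamaDatasetAbraham.__getitem__ after this change.
# The tokenizer arguments are assumptions for illustration; only the returned
# dict follows the diff.
def __getitem__(self, idx):
    smiles = str(self.df.iloc[idx, 0])
    local_encoded = self.tokenizer(smiles, truncation=True, max_length=self.max_seq_length)
    return {
        "input_ids": torch.tensor(local_encoded["input_ids"]),
        "attention_mask": torch.tensor(local_encoded["attention_mask"]),
        # Per the author's comment the value is unused for this fine-tuning
        # path; it just has to be something the collator can convert to a tensor.
        "labels": torch.tensor(local_encoded["input_ids"]),
    }

The same one-line change is applied to `datamodule_finetune_sl.py` itself further down in this commit.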
Untitled.ipynb ADDED
@@ -0,0 +1,467 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "7e38e8a0-ff53-465c-9861-069d6dc54714",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import streamlit as st\n"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "3c37a529-b0b4-4aed-a198-49f5e5bdbe02",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "import sys\n",
21
+ "import os\n",
22
+ "import torch\n",
23
+ "from torch import nn\n",
24
+ "import torchmetrics\n",
25
+ "from transformers import LlamaModel, LlamaConfig\n",
26
+ "import numpy as np\n",
27
+ "import pandas as pd\n",
28
+ "import warnings\n",
29
+ "import lightning as L\n",
30
+ "torch.set_float32_matmul_precision('high')\n",
31
+ "warnings.filterwarnings(\"ignore\", module=\"pl_bolts\")"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 5,
37
+ "id": "1daba56d-a0e2-4be7-a2ea-52579726c201",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "sys.path.append( '../')\n",
42
+ "\n",
43
+ "import tokenizer_sl, datamodule_finetune_sl, model_finetune_sl, chemllama_mtr, utils_sl\n",
44
+ "import auto_evaluator_sl\n",
45
+ "\n",
46
+ "from torch.utils.data import Dataset, DataLoader\n",
47
+ "from transformers import DataCollatorWithPadding\n",
48
+ "\n",
49
+ "torch.manual_seed(1004)\n",
50
+ "np.random.seed(1004)\n",
51
+ "\n",
52
+ "smiles_str = \"COO2\"\n",
53
+ "\n",
54
+ "solute_or_solvent = \"Solvent\"\n",
55
+ "\n"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 6,
61
+ "id": "7d3d996c-59b3-4079-83ef-818651add7ba",
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "class ChemLlama(nn.Module):\n",
66
+ " def __init__(\n",
67
+ " self,\n",
68
+ " max_position_embeddings=512,\n",
69
+ " vocab_size=591,\n",
70
+ " pad_token_id=0,\n",
71
+ " bos_token_id=12,\n",
72
+ " eos_token_id=13,\n",
73
+ " hidden_size=768,\n",
74
+ " intermediate_size=768,\n",
75
+ " num_labels=105,\n",
76
+ " attention_dropout=0.144,\n",
77
+ " num_hidden_layers=7,\n",
78
+ " num_attention_heads=8,\n",
79
+ " learning_rate=0.0001,\n",
80
+ " ):\n",
81
+ " super(ChemLlama, self).__init__()\n",
82
+ " \n",
83
+ " self.hidden_size = hidden_size\n",
84
+ " self.intermediate_size = intermediate_size\n",
85
+ " self.num_labels = num_labels\n",
86
+ " self.vocab_size = vocab_size\n",
87
+ " self.pad_token_id = pad_token_id\n",
88
+ " self.bos_token_id = bos_token_id\n",
89
+ " self.eos_token_id = eos_token_id\n",
90
+ " self.num_hidden_layers = num_hidden_layers\n",
91
+ " self.num_attention_heads = num_attention_heads\n",
92
+ " self.attention_dropout = attention_dropout\n",
93
+ " self.max_position_embeddings = max_position_embeddings\n",
94
+ "\n",
95
+ " self.mae = torchmetrics.MeanAbsoluteError()\n",
96
+ " self.mse = torchmetrics.MeanSquaredError()\n",
97
+ "\n",
98
+ " self.config_llama = LlamaConfig(\n",
99
+ " max_position_embeddings=self.max_position_embeddings,\n",
100
+ " vocab_size=self.vocab_size,\n",
101
+ " hidden_size=self.hidden_size,\n",
102
+ " intermediate_size=self.intermediate_size,\n",
103
+ " num_hidden_layers=self.num_hidden_layers,\n",
104
+ " num_attention_heads=self.num_attention_heads,\n",
105
+ " attention_dropout=self.attention_dropout,\n",
106
+ " pad_token_id=self.pad_token_id,\n",
107
+ " bos_token_id=self.bos_token_id,\n",
108
+ " eos_token_id=self.eos_token_id,\n",
109
+ " )\n",
110
+ "\n",
111
+ " self.loss_fn = nn.L1Loss()\n",
112
+ "\n",
113
+ " self.llama = LlamaModel(self.config_llama)\n",
114
+ " self.gelu = nn.GELU()\n",
115
+ " self.score = nn.Linear(self.hidden_size, self.num_labels)\n",
116
+ "\n",
117
+ " def forward(self, input_ids, attention_mask, labels=None):\n",
118
+ "\n",
119
+ " transformer_outputs = self.llama(\n",
120
+ " input_ids=input_ids, attention_mask=attention_mask\n",
121
+ " )\n",
122
+ "\n",
123
+ " hidden_states = transformer_outputs[0]\n",
124
+ " hidden_states = self.gelu(hidden_states)\n",
125
+ " logits = self.score(hidden_states)\n",
126
+ "\n",
127
+ " if input_ids is not None:\n",
128
+ " batch_size = input_ids.shape[0]\n",
129
+ " else:\n",
130
+ " batch_size = inputs_embeds.shape[0]\n",
131
+ "\n",
132
+ " if self.config_llama.pad_token_id is None and batch_size != 1:\n",
133
+ " raise ValueError(\n",
134
+ " \"Cannot handle batch sizes > 1 if no padding token is defined.\"\n",
135
+ " )\n",
136
+ " if self.config_llama.pad_token_id is None:\n",
137
+ " sequence_lengths = -1\n",
138
+ " else:\n",
139
+ " if input_ids is not None:\n",
140
+ " # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility\n",
141
+ " sequence_lengths = (\n",
142
+ " torch.eq(input_ids, self.config_llama.pad_token_id).int().argmax(-1)\n",
143
+ " - 1\n",
144
+ " )\n",
145
+ " sequence_lengths = sequence_lengths % input_ids.shape[-1]\n",
146
+ " sequence_lengths = sequence_lengths.to(logits.device)\n",
147
+ " else:\n",
148
+ " sequence_lengths = -1\n",
149
+ " # raise ValueError(len(sequence_lengths), sequence_lengths)\n",
150
+ "\n",
151
+ " pooled_logits = logits[\n",
152
+ " torch.arange(batch_size, device=logits.device), sequence_lengths\n",
153
+ " ]\n",
154
+ " return pooled_logits\n",
155
+ "\n",
156
+ "\n",
157
+ "chemllama_mtr = ChemLlama()"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 7,
163
+ "id": "da586e81-ace8-489d-a11a-ae44a0ed2369",
164
+ "metadata": {},
165
+ "outputs": [
166
+ {
167
+ "name": "stdout",
168
+ "output_type": "stream",
169
+ "text": [
170
+ "llama.embed_tokens.weight False\n",
171
+ "llama.layers.0.self_attn.q_proj.weight False\n",
172
+ "llama.layers.0.self_attn.k_proj.weight False\n",
173
+ "llama.layers.0.self_attn.v_proj.weight False\n",
174
+ "llama.layers.0.self_attn.o_proj.weight False\n",
175
+ "llama.layers.0.mlp.gate_proj.weight False\n",
176
+ "llama.layers.0.mlp.up_proj.weight False\n",
177
+ "llama.layers.0.mlp.down_proj.weight False\n",
178
+ "llama.layers.0.input_layernorm.weight False\n",
179
+ "llama.layers.0.post_attention_layernorm.weight False\n",
180
+ "llama.layers.1.self_attn.q_proj.weight False\n",
181
+ "llama.layers.1.self_attn.k_proj.weight False\n",
182
+ "llama.layers.1.self_attn.v_proj.weight False\n",
183
+ "llama.layers.1.self_attn.o_proj.weight False\n",
184
+ "llama.layers.1.mlp.gate_proj.weight False\n",
185
+ "llama.layers.1.mlp.up_proj.weight False\n",
186
+ "llama.layers.1.mlp.down_proj.weight False\n",
187
+ "llama.layers.1.input_layernorm.weight False\n",
188
+ "llama.layers.1.post_attention_layernorm.weight False\n",
189
+ "llama.layers.2.self_attn.q_proj.weight False\n",
190
+ "llama.layers.2.self_attn.k_proj.weight False\n",
191
+ "llama.layers.2.self_attn.v_proj.weight False\n",
192
+ "llama.layers.2.self_attn.o_proj.weight False\n",
193
+ "llama.layers.2.mlp.gate_proj.weight False\n",
194
+ "llama.layers.2.mlp.up_proj.weight False\n",
195
+ "llama.layers.2.mlp.down_proj.weight False\n",
196
+ "llama.layers.2.input_layernorm.weight False\n",
197
+ "llama.layers.2.post_attention_layernorm.weight False\n",
198
+ "llama.layers.3.self_attn.q_proj.weight False\n",
199
+ "llama.layers.3.self_attn.k_proj.weight False\n",
200
+ "llama.layers.3.self_attn.v_proj.weight False\n",
201
+ "llama.layers.3.self_attn.o_proj.weight False\n",
202
+ "llama.layers.3.mlp.gate_proj.weight False\n",
203
+ "llama.layers.3.mlp.up_proj.weight False\n",
204
+ "llama.layers.3.mlp.down_proj.weight False\n",
205
+ "llama.layers.3.input_layernorm.weight False\n",
206
+ "llama.layers.3.post_attention_layernorm.weight False\n",
207
+ "llama.layers.4.self_attn.q_proj.weight False\n",
208
+ "llama.layers.4.self_attn.k_proj.weight False\n",
209
+ "llama.layers.4.self_attn.v_proj.weight False\n",
210
+ "llama.layers.4.self_attn.o_proj.weight False\n",
211
+ "llama.layers.4.mlp.gate_proj.weight False\n",
212
+ "llama.layers.4.mlp.up_proj.weight False\n",
213
+ "llama.layers.4.mlp.down_proj.weight False\n",
214
+ "llama.layers.4.input_layernorm.weight False\n",
215
+ "llama.layers.4.post_attention_layernorm.weight False\n",
216
+ "llama.layers.5.self_attn.q_proj.weight False\n",
217
+ "llama.layers.5.self_attn.k_proj.weight False\n",
218
+ "llama.layers.5.self_attn.v_proj.weight False\n",
219
+ "llama.layers.5.self_attn.o_proj.weight False\n",
220
+ "llama.layers.5.mlp.gate_proj.weight False\n",
221
+ "llama.layers.5.mlp.up_proj.weight False\n",
222
+ "llama.layers.5.mlp.down_proj.weight False\n",
223
+ "llama.layers.5.input_layernorm.weight False\n",
224
+ "llama.layers.5.post_attention_layernorm.weight False\n",
225
+ "llama.layers.6.self_attn.q_proj.weight False\n",
226
+ "llama.layers.6.self_attn.k_proj.weight False\n",
227
+ "llama.layers.6.self_attn.v_proj.weight False\n",
228
+ "llama.layers.6.self_attn.o_proj.weight False\n",
229
+ "llama.layers.6.mlp.gate_proj.weight False\n",
230
+ "llama.layers.6.mlp.up_proj.weight False\n",
231
+ "llama.layers.6.mlp.down_proj.weight False\n",
232
+ "llama.layers.6.input_layernorm.weight False\n",
233
+ "llama.layers.6.post_attention_layernorm.weight False\n",
234
+ "llama.norm.weight False\n",
235
+ "score.weight False\n",
236
+ "score.bias False\n"
237
+ ]
238
+ }
239
+ ],
240
+ "source": [
241
+ "class ChemLlama_FT(nn.Module):\n",
242
+ " def __init__(\n",
243
+ " self,\n",
244
+ " model_mtr,\n",
245
+ " linear_param:int=64,\n",
246
+ " use_freeze:bool=True,\n",
247
+ " *args, **kwargs\n",
248
+ " ):\n",
249
+ " super(ChemLlama_FT, self).__init__()\n",
250
+ " # self.save_hyperparameters()\n",
251
+ "\n",
252
+ " self.model_mtr = model_mtr\n",
253
+ " if use_freeze:\n",
254
+ " # self.model_mtr.freeze()\n",
255
+ " for name, param in model_mtr.named_parameters():\n",
256
+ " param.requires_grad = False\n",
257
+ " print(name, param.requires_grad)\n",
258
+ " \n",
259
+ " self.gelu = nn.GELU()\n",
260
+ " self.linear1 = nn.Linear(self.model_mtr.num_labels, linear_param)\n",
261
+ " self.linear2 = nn.Linear(linear_param, linear_param)\n",
262
+ " self.regression = nn.Linear(linear_param, 5)\n",
263
+ "\n",
264
+ " self.loss_fn = nn.L1Loss()\n",
265
+ "\n",
266
+ " def forward(self, input_ids, attention_mask, labels=None):\n",
267
+ " x = self.model_mtr(input_ids=input_ids, attention_mask=attention_mask)\n",
268
+ " x = self.gelu(x)\n",
269
+ " x = self.linear1(x)\n",
270
+ " x = self.gelu(x)\n",
271
+ " x = self.linear2(x)\n",
272
+ " x = self.gelu(x)\n",
273
+ " x = self.regression(x)\n",
274
+ " \n",
275
+ " return x\n",
276
+ " \n",
277
+ "chemllama_ft = ChemLlama_FT(model_mtr=chemllama_mtr)"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 9,
283
+ "id": "49537588-bad0-44ff-b7fd-73683cdb2f6c",
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "# I just reused our previous research code with some modifications.\n",
288
+ "dir_main = \"../\"\n",
289
+ "\n",
290
+ "max_seq_length = 512\n",
291
+ "\n",
292
+ "tokenizer = tokenizer_sl.fn_load_tokenizer_llama(\n",
293
+ " max_seq_length=max_seq_length,\n",
294
+ ")\n",
295
+ "max_length = max_seq_length\n",
296
+ "num_workers = 2\n",
297
+ "\n",
298
+ "## FT\n",
299
+ "\n",
300
+ "dir_model_ft_to_save = f\"{dir_main}/SolLlama-mtr\"\n",
301
+ "# name_model_ft = 'Solvent.pt'\n",
302
+ "name_model_ft = f\"{solute_or_solvent}.pt\""
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": 20,
308
+ "id": "cc155008-a7f1-4dd1-8fc3-ad299a5938a6",
309
+ "metadata": {},
310
+ "outputs": [],
311
+ "source": [
312
+ "device = 'cpu'\n",
313
+ "# Predict\n",
314
+ "local_model_ft = utils_sl.load_model_ft_with(\n",
315
+ " class_model_ft=chemllama_ft, \n",
316
+ " dir_model_ft=dir_model_ft_to_save,\n",
317
+ " name_model_ft=name_model_ft\n",
318
+ ").to(device)\n",
319
+ "\n",
320
+ "# result = trainer.predict(local_model_ft, data_module)\n",
321
+ "# result_pred = list()\n",
322
+ "# result_label = list()\n",
323
+ "# for bat in range(len(result)):\n",
324
+ "# result_pred.append(result[bat][0].squeeze())\n",
325
+ "# result_label.append(result[bat][1])\n",
326
+ "\n",
327
+ "# with open('./smiles_str.txt', 'r') as file:\n",
328
+ "# smiles_str = file.readline()\n",
329
+ " \n",
330
+ "dataset_test = datamodule_finetune_sl.CustomLlamaDatasetAbraham(\n",
331
+ " df=pd.DataFrame([smiles_str]),\n",
332
+ " tokenizer=tokenizer,\n",
333
+ " max_seq_length=max_length\n",
334
+ ")\n",
335
+ "\n",
336
+ "data_collator = DataCollatorWithPadding(tokenizer)\n",
337
+ "dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False, collate_fn=data_collator)"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 31,
343
+ "id": "69baeffd-a2cb-439c-be46-69ee4fc5fea1",
344
+ "metadata": {},
345
+ "outputs": [
346
+ {
347
+ "data": {
348
+ "text/plain": [
349
+ "0 COO2\n",
350
+ "Name: 0, dtype: object"
351
+ ]
352
+ },
353
+ "execution_count": 31,
354
+ "metadata": {},
355
+ "output_type": "execute_result"
356
+ }
357
+ ],
358
+ "source": [
359
+ "pd.DataFrame([smiles_str]).iloc[:,0:].iloc[0]"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 22,
365
+ "id": "7994f626-ca68-4ef1-811d-c2b684cd62ce",
366
+ "metadata": {},
367
+ "outputs": [
368
+ {
369
+ "data": {
370
+ "text/plain": [
371
+ "<datamodule_finetune_sl.CustomLlamaDatasetAbraham at 0x7f81a4f6cf10>"
372
+ ]
373
+ },
374
+ "execution_count": 22,
375
+ "metadata": {},
376
+ "output_type": "execute_result"
377
+ }
378
+ ],
379
+ "source": [
380
+ "dataset_test"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": null,
386
+ "id": "3b173642-1695-40dc-82b5-0e7b775fff38",
387
+ "metadata": {},
388
+ "outputs": [],
389
+ "source": [
390
+ "data_loader_valid = DataLoader(dataset_valid, batch_size=int(batch_size*1.5), shuffle=False, collate_fn=data_collator, num_workers=4, pin_memory=True)"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 21,
396
+ "id": "a6d6145b-d5f9-44e4-85ca-f27b8c8a339d",
397
+ "metadata": {},
398
+ "outputs": [
399
+ {
400
+ "ename": "ValueError",
401
+ "evalue": "Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).",
402
+ "output_type": "error",
403
+ "traceback": [
404
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
405
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
406
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:759\u001b[0m, in \u001b[0;36mBatchEncoding.convert_to_tensors\u001b[0;34m(self, tensor_type, prepend_batch_axis)\u001b[0m\n\u001b[1;32m 758\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_tensor(value):\n\u001b[0;32m--> 759\u001b[0m tensor \u001b[38;5;241m=\u001b[39m \u001b[43mas_tensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 761\u001b[0m \u001b[38;5;66;03m# Removing this for now in favor of controlling the shape with `prepend_batch_axis`\u001b[39;00m\n\u001b[1;32m 762\u001b[0m \u001b[38;5;66;03m# # at-least2d\u001b[39;00m\n\u001b[1;32m 763\u001b[0m \u001b[38;5;66;03m# if tensor.ndim > 2:\u001b[39;00m\n\u001b[1;32m 764\u001b[0m \u001b[38;5;66;03m# tensor = tensor.squeeze(0)\u001b[39;00m\n\u001b[1;32m 765\u001b[0m \u001b[38;5;66;03m# elif tensor.ndim < 2:\u001b[39;00m\n\u001b[1;32m 766\u001b[0m \u001b[38;5;66;03m# tensor = tensor[None, :]\u001b[39;00m\n",
407
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:721\u001b[0m, in \u001b[0;36mBatchEncoding.convert_to_tensors.<locals>.as_tensor\u001b[0;34m(value, dtype)\u001b[0m\n\u001b[1;32m 720\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mtensor(np\u001b[38;5;241m.\u001b[39marray(value))\n\u001b[0;32m--> 721\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n",
408
+ "\u001b[0;31mRuntimeError\u001b[0m: Could not infer dtype of NoneType",
409
+ "\nThe above exception was the direct cause of the following exception:\n",
410
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
411
+ "Cell \u001b[0;32mIn[21], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m local_model_ft\u001b[38;5;241m.\u001b[39meval()\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39minference_mode():\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, v_batch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(dataloader_test):\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# v_input_ids = v_batch['input_ids'].to(device)\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# v_attention_mask = v_batch['attention_mask'].to(device)\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# # v_y_labels = v_batch['labels'].to(device)\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# v_y_logits = local_model_ft(input_ids=v_input_ids, attention_mask=v_attention_mask)\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# list_predictions.append(v_y_logits[0][0].tolist())\u001b[39;00m\n",
412
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/torch/utils/data/dataloader.py:634\u001b[0m, in \u001b[0;36m_BaseDataLoaderIter.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampler_iter \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 632\u001b[0m \u001b[38;5;66;03m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[1;32m 633\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset() \u001b[38;5;66;03m# type: ignore[call-arg]\u001b[39;00m\n\u001b[0;32m--> 634\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 635\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_kind \u001b[38;5;241m==\u001b[39m _DatasetKind\u001b[38;5;241m.\u001b[39mIterable \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 637\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 638\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called:\n",
413
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/torch/utils/data/dataloader.py:678\u001b[0m, in \u001b[0;36m_SingleProcessDataLoaderIter._next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 676\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_next_data\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 677\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_next_index() \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[0;32m--> 678\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset_fetcher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[1;32m 679\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory:\n\u001b[1;32m 680\u001b[0m data \u001b[38;5;241m=\u001b[39m _utils\u001b[38;5;241m.\u001b[39mpin_memory\u001b[38;5;241m.\u001b[39mpin_memory(data, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory_device)\n",
414
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py:54\u001b[0m, in \u001b[0;36m_MapDatasetFetcher.fetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n\u001b[0;32m---> 54\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollate_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n",
415
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/data/data_collator.py:271\u001b[0m, in \u001b[0;36mDataCollatorWithPadding.__call__\u001b[0;34m(self, features)\u001b[0m\n\u001b[1;32m 270\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, features: List[Dict[\u001b[38;5;28mstr\u001b[39m, Any]]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Dict[\u001b[38;5;28mstr\u001b[39m, Any]:\n\u001b[0;32m--> 271\u001b[0m batch \u001b[38;5;241m=\u001b[39m \u001b[43mpad_without_fast_tokenizer_warning\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 274\u001b[0m \u001b[43m \u001b[49m\u001b[43mpadding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 275\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 276\u001b[0m \u001b[43m \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 277\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 278\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlabel\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m batch:\n\u001b[1;32m 280\u001b[0m batch[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlabels\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m batch[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlabel\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
416
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/data/data_collator.py:66\u001b[0m, in \u001b[0;36mpad_without_fast_tokenizer_warning\u001b[0;34m(tokenizer, *pad_args, **pad_kwargs)\u001b[0m\n\u001b[1;32m 63\u001b[0m tokenizer\u001b[38;5;241m.\u001b[39mdeprecation_warnings[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAsking-to-pad-a-fast-tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 66\u001b[0m padded \u001b[38;5;241m=\u001b[39m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpad\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpad_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpad_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 68\u001b[0m \u001b[38;5;66;03m# Restore the state of the warning.\u001b[39;00m\n\u001b[1;32m 69\u001b[0m tokenizer\u001b[38;5;241m.\u001b[39mdeprecation_warnings[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAsking-to-pad-a-fast-tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m warning_state\n",
417
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:3369\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.pad\u001b[0;34m(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)\u001b[0m\n\u001b[1;32m 3366\u001b[0m batch_outputs[key] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 3367\u001b[0m batch_outputs[key]\u001b[38;5;241m.\u001b[39mappend(value)\n\u001b[0;32m-> 3369\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mBatchEncoding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtensor_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m)\u001b[49m\n",
418
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:224\u001b[0m, in \u001b[0;36mBatchEncoding.__init__\u001b[0;34m(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)\u001b[0m\n\u001b[1;32m 220\u001b[0m n_sequences \u001b[38;5;241m=\u001b[39m encoding[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mn_sequences\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_n_sequences \u001b[38;5;241m=\u001b[39m n_sequences\n\u001b[0;32m--> 224\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_to_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensor_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtensor_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprepend_batch_axis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepend_batch_axis\u001b[49m\u001b[43m)\u001b[49m\n",
419
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:775\u001b[0m, in \u001b[0;36mBatchEncoding.convert_to_tensors\u001b[0;34m(self, tensor_type, prepend_batch_axis)\u001b[0m\n\u001b[1;32m 770\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moverflowing_tokens\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 771\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 772\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to create tensor returning overflowing tokens of different lengths. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 773\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease see if a fast version of this tokenizer is available to have this feature available.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 774\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[0;32m--> 775\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 776\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to create tensor, you should probably activate truncation and/or padding with\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 777\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpadding=True\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtruncation=True\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m to have batched tensors with the same length. Perhaps your\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 778\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m features (`\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` in this case) have excessive nesting (inputs type `list` where type `int` is\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 779\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m expected).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 780\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 782\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n",
420
+ "\u001b[0;31mValueError\u001b[0m: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected)."
421
+ ]
422
+ }
423
+ ],
424
+ "source": [
425
+ "list_predictions = []\n",
426
+ "local_model_ft.eval()\n",
427
+ "with torch.inference_mode():\n",
428
+ " for i, v_batch in enumerate(dataloader_test):\n",
429
+ " break\n",
430
+ " # v_input_ids = v_batch['input_ids'].to(device)\n",
431
+ " # v_attention_mask = v_batch['attention_mask'].to(device)\n",
432
+ " # # v_y_labels = v_batch['labels'].to(device)\n",
433
+ " # v_y_logits = local_model_ft(input_ids=v_input_ids, attention_mask=v_attention_mask)\n",
434
+ " # list_predictions.append(v_y_logits[0][0].tolist())"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": null,
440
+ "id": "7bc3e296-6871-45fb-8459-78eadc36bb61",
441
+ "metadata": {},
442
+ "outputs": [],
443
+ "source": []
444
+ }
445
+ ],
446
+ "metadata": {
447
+ "kernelspec": {
448
+ "display_name": "chemllm",
449
+ "language": "python",
450
+ "name": "chemllm"
451
+ },
452
+ "language_info": {
453
+ "codemirror_mode": {
454
+ "name": "ipython",
455
+ "version": 3
456
+ },
457
+ "file_extension": ".py",
458
+ "mimetype": "text/x-python",
459
+ "name": "python",
460
+ "nbconvert_exporter": "python",
461
+ "pygments_lexer": "ipython3",
462
+ "version": "3.11.3"
463
+ }
464
+ },
465
+ "nbformat": 4,
466
+ "nbformat_minor": 5
467
+ }
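The notebook above appears to stop at that collator error because the `datamodule_finetune_sl` importable at the time still returned `labels: None`; with the dataset fix in this commit the single-SMILES batch can be built and the commented-out loop body in that cell becomes runnable. Below is a hedged sketch of that inference step, reusing the notebook's own objects (`local_model_ft`, `dataloader_test`, `device`); taking the full output row is an assumption about the regression head, whereas the notebook's commented line indexes `[0][0]`.

# Sketch of the intended single-SMILES inference loop, mirroring the
# commented-out lines in the notebook's error cell (assumes the fixed dataset).
list_predictions = []
local_model_ft.eval()
with torch.inference_mode():
    for v_batch in dataloader_test:
        v_input_ids = v_batch["input_ids"].to(device)
        v_attention_mask = v_batch["attention_mask"].to(device)
        v_y_logits = local_model_ft(input_ids=v_input_ids, attention_mask=v_attention_mask)
        # ChemLlama_FT's regression head outputs shape (batch, 5); keep the
        # row for the single input SMILES.
        list_predictions.append(v_y_logits[0].tolist())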
__pycache__/auto_evaluator_sl.cpython-311.pyc ADDED
Binary file (9.12 kB).
 
__pycache__/chemllama_mtr.cpython-311.pyc ADDED
Binary file (7.89 kB).
 
__pycache__/datamodule_finetune_sl.cpython-311.pyc ADDED
Binary file (5.45 kB).
 
__pycache__/model_finetune_sl.cpython-311.pyc ADDED
Binary file (6.43 kB).
 
__pycache__/tokenizer_sl.cpython-311.pyc ADDED
Binary file (1.43 kB).
 
__pycache__/utils_sl.cpython-311.pyc ADDED
Binary file (3.55 kB).
 
datamodule_finetune_sl.py CHANGED
@@ -61,7 +61,7 @@ class CustomLlamaDatasetAbraham(Dataset):
         return {
             "input_ids": torch.tensor(local_encoded["input_ids"]),
             "attention_mask": torch.tensor(local_encoded["attention_mask"]),
-            "labels": None,
+            "labels": torch.tensor(local_encoded["input_ids"]), # this one does not matter for sl
         }
 
 class CustomFinetuneDataModule(L.LightningDataModule):