BrightBlueCheese committed
Commit 7f92264
1 Parent(s): 90d0c74
.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
+ {
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
.ipynb_checkpoints/datamodule_finetune_sl-checkpoint.py CHANGED
@@ -61,7 +61,7 @@ class CustomLlamaDatasetAbraham(Dataset):
         return {
             "input_ids": torch.tensor(local_encoded["input_ids"]),
             "attention_mask": torch.tensor(local_encoded["attention_mask"]),
-            "labels": None,
+            "labels": torch.tensor(local_encoded["input_ids"]), # this one does not matter for sl
         }
 
 class CustomFinetuneDataModule(L.LightningDataModule):
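For context: this change swaps the `None` placeholder for a tensor copy of the input IDs. The traceback in `Untitled.ipynb` (added later in this commit) shows why `None` breaks inference: `DataCollatorWithPadding` hands every field of the returned dict to the tokenizer's `pad`, which ends up calling `torch.tensor(None)` and fails with `RuntimeError: Could not infer dtype of NoneType`, surfaced as the `ValueError` about padding/truncation on the `labels` feature. A minimal sketch of the resulting `__getitem__` follows; the tokenizer call and its arguments are illustrative assumptions, only the returned dict mirrors the diff above.

# Hypothetical sketch of CustomLlamaDatasetAbraham.__getitem__ after this change.
# The tokenizer arguments are assumptions for illustration; only the returned
# dict follows the diff.
def __getitem__(self, idx):
    smiles = str(self.df.iloc[idx, 0])
    local_encoded = self.tokenizer(smiles, truncation=True, max_length=self.max_seq_length)
    return {
        "input_ids": torch.tensor(local_encoded["input_ids"]),
        "attention_mask": torch.tensor(local_encoded["attention_mask"]),
        # Per the author's comment the value is unused for this fine-tuning
        # path; it just has to be something the collator can convert to a tensor.
        "labels": torch.tensor(local_encoded["input_ids"]),
    }

The same one-line change is applied to `datamodule_finetune_sl.py` itself further down in this commit.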
Untitled.ipynb ADDED
@@ -0,0 +1,467 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "7e38e8a0-ff53-465c-9861-069d6dc54714",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import streamlit as st\n"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "3c37a529-b0b4-4aed-a198-49f5e5bdbe02",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "import sys\n",
21
+ "import os\n",
22
+ "import torch\n",
23
+ "from torch import nn\n",
24
+ "import torchmetrics\n",
25
+ "from transformers import LlamaModel, LlamaConfig\n",
26
+ "import numpy as np\n",
27
+ "import pandas as pd\n",
28
+ "import warnings\n",
29
+ "import lightning as L\n",
30
+ "torch.set_float32_matmul_precision('high')\n",
31
+ "warnings.filterwarnings(\"ignore\", module=\"pl_bolts\")"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 5,
37
+ "id": "1daba56d-a0e2-4be7-a2ea-52579726c201",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "sys.path.append( '../')\n",
42
+ "\n",
43
+ "import tokenizer_sl, datamodule_finetune_sl, model_finetune_sl, chemllama_mtr, utils_sl\n",
44
+ "import auto_evaluator_sl\n",
45
+ "\n",
46
+ "from torch.utils.data import Dataset, DataLoader\n",
47
+ "from transformers import DataCollatorWithPadding\n",
48
+ "\n",
49
+ "torch.manual_seed(1004)\n",
50
+ "np.random.seed(1004)\n",
51
+ "\n",
52
+ "smiles_str = \"COO2\"\n",
53
+ "\n",
54
+ "solute_or_solvent = \"Solvent\"\n",
55
+ "\n"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 6,
61
+ "id": "7d3d996c-59b3-4079-83ef-818651add7ba",
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "class ChemLlama(nn.Module):\n",
66
+ " def __init__(\n",
67
+ " self,\n",
68
+ " max_position_embeddings=512,\n",
69
+ " vocab_size=591,\n",
70
+ " pad_token_id=0,\n",
71
+ " bos_token_id=12,\n",
72
+ " eos_token_id=13,\n",
73
+ " hidden_size=768,\n",
74
+ " intermediate_size=768,\n",
75
+ " num_labels=105,\n",
76
+ " attention_dropout=0.144,\n",
77
+ " num_hidden_layers=7,\n",
78
+ " num_attention_heads=8,\n",
79
+ " learning_rate=0.0001,\n",
80
+ " ):\n",
81
+ " super(ChemLlama, self).__init__()\n",
82
+ " \n",
83
+ " self.hidden_size = hidden_size\n",
84
+ " self.intermediate_size = intermediate_size\n",
85
+ " self.num_labels = num_labels\n",
86
+ " self.vocab_size = vocab_size\n",
87
+ " self.pad_token_id = pad_token_id\n",
88
+ " self.bos_token_id = bos_token_id\n",
89
+ " self.eos_token_id = eos_token_id\n",
90
+ " self.num_hidden_layers = num_hidden_layers\n",
91
+ " self.num_attention_heads = num_attention_heads\n",
92
+ " self.attention_dropout = attention_dropout\n",
93
+ " self.max_position_embeddings = max_position_embeddings\n",
94
+ "\n",
95
+ " self.mae = torchmetrics.MeanAbsoluteError()\n",
96
+ " self.mse = torchmetrics.MeanSquaredError()\n",
97
+ "\n",
98
+ " self.config_llama = LlamaConfig(\n",
99
+ " max_position_embeddings=self.max_position_embeddings,\n",
100
+ " vocab_size=self.vocab_size,\n",
101
+ " hidden_size=self.hidden_size,\n",
102
+ " intermediate_size=self.intermediate_size,\n",
103
+ " num_hidden_layers=self.num_hidden_layers,\n",
104
+ " num_attention_heads=self.num_attention_heads,\n",
105
+ " attention_dropout=self.attention_dropout,\n",
106
+ " pad_token_id=self.pad_token_id,\n",
107
+ " bos_token_id=self.bos_token_id,\n",
108
+ " eos_token_id=self.eos_token_id,\n",
109
+ " )\n",
110
+ "\n",
111
+ " self.loss_fn = nn.L1Loss()\n",
112
+ "\n",
113
+ " self.llama = LlamaModel(self.config_llama)\n",
114
+ " self.gelu = nn.GELU()\n",
115
+ " self.score = nn.Linear(self.hidden_size, self.num_labels)\n",
116
+ "\n",
117
+ " def forward(self, input_ids, attention_mask, labels=None):\n",
118
+ "\n",
119
+ " transformer_outputs = self.llama(\n",
120
+ " input_ids=input_ids, attention_mask=attention_mask\n",
121
+ " )\n",
122
+ "\n",
123
+ " hidden_states = transformer_outputs[0]\n",
124
+ " hidden_states = self.gelu(hidden_states)\n",
125
+ " logits = self.score(hidden_states)\n",
126
+ "\n",
127
+ " if input_ids is not None:\n",
128
+ " batch_size = input_ids.shape[0]\n",
129
+ " else:\n",
130
+ " batch_size = inputs_embeds.shape[0]\n",
131
+ "\n",
132
+ " if self.config_llama.pad_token_id is None and batch_size != 1:\n",
133
+ " raise ValueError(\n",
134
+ " \"Cannot handle batch sizes > 1 if no padding token is defined.\"\n",
135
+ " )\n",
136
+ " if self.config_llama.pad_token_id is None:\n",
137
+ " sequence_lengths = -1\n",
138
+ " else:\n",
139
+ " if input_ids is not None:\n",
140
+ " # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility\n",
141
+ " sequence_lengths = (\n",
142
+ " torch.eq(input_ids, self.config_llama.pad_token_id).int().argmax(-1)\n",
143
+ " - 1\n",
144
+ " )\n",
145
+ " sequence_lengths = sequence_lengths % input_ids.shape[-1]\n",
146
+ " sequence_lengths = sequence_lengths.to(logits.device)\n",
147
+ " else:\n",
148
+ " sequence_lengths = -1\n",
149
+ " # raise ValueError(len(sequence_lengths), sequence_lengths)\n",
150
+ "\n",
151
+ " pooled_logits = logits[\n",
152
+ " torch.arange(batch_size, device=logits.device), sequence_lengths\n",
153
+ " ]\n",
154
+ " return pooled_logits\n",
155
+ "\n",
156
+ "\n",
157
+ "chemllama_mtr = ChemLlama()"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 7,
163
+ "id": "da586e81-ace8-489d-a11a-ae44a0ed2369",
164
+ "metadata": {},
165
+ "outputs": [
166
+ {
167
+ "name": "stdout",
168
+ "output_type": "stream",
169
+ "text": [
170
+ "llama.embed_tokens.weight False\n",
171
+ "llama.layers.0.self_attn.q_proj.weight False\n",
172
+ "llama.layers.0.self_attn.k_proj.weight False\n",
173
+ "llama.layers.0.self_attn.v_proj.weight False\n",
174
+ "llama.layers.0.self_attn.o_proj.weight False\n",
175
+ "llama.layers.0.mlp.gate_proj.weight False\n",
176
+ "llama.layers.0.mlp.up_proj.weight False\n",
177
+ "llama.layers.0.mlp.down_proj.weight False\n",
178
+ "llama.layers.0.input_layernorm.weight False\n",
179
+ "llama.layers.0.post_attention_layernorm.weight False\n",
180
+ "llama.layers.1.self_attn.q_proj.weight False\n",
181
+ "llama.layers.1.self_attn.k_proj.weight False\n",
182
+ "llama.layers.1.self_attn.v_proj.weight False\n",
183
+ "llama.layers.1.self_attn.o_proj.weight False\n",
184
+ "llama.layers.1.mlp.gate_proj.weight False\n",
185
+ "llama.layers.1.mlp.up_proj.weight False\n",
186
+ "llama.layers.1.mlp.down_proj.weight False\n",
187
+ "llama.layers.1.input_layernorm.weight False\n",
188
+ "llama.layers.1.post_attention_layernorm.weight False\n",
189
+ "llama.layers.2.self_attn.q_proj.weight False\n",
190
+ "llama.layers.2.self_attn.k_proj.weight False\n",
191
+ "llama.layers.2.self_attn.v_proj.weight False\n",
192
+ "llama.layers.2.self_attn.o_proj.weight False\n",
193
+ "llama.layers.2.mlp.gate_proj.weight False\n",
194
+ "llama.layers.2.mlp.up_proj.weight False\n",
195
+ "llama.layers.2.mlp.down_proj.weight False\n",
196
+ "llama.layers.2.input_layernorm.weight False\n",
197
+ "llama.layers.2.post_attention_layernorm.weight False\n",
198
+ "llama.layers.3.self_attn.q_proj.weight False\n",
199
+ "llama.layers.3.self_attn.k_proj.weight False\n",
200
+ "llama.layers.3.self_attn.v_proj.weight False\n",
201
+ "llama.layers.3.self_attn.o_proj.weight False\n",
202
+ "llama.layers.3.mlp.gate_proj.weight False\n",
203
+ "llama.layers.3.mlp.up_proj.weight False\n",
204
+ "llama.layers.3.mlp.down_proj.weight False\n",
205
+ "llama.layers.3.input_layernorm.weight False\n",
206
+ "llama.layers.3.post_attention_layernorm.weight False\n",
207
+ "llama.layers.4.self_attn.q_proj.weight False\n",
208
+ "llama.layers.4.self_attn.k_proj.weight False\n",
209
+ "llama.layers.4.self_attn.v_proj.weight False\n",
210
+ "llama.layers.4.self_attn.o_proj.weight False\n",
211
+ "llama.layers.4.mlp.gate_proj.weight False\n",
212
+ "llama.layers.4.mlp.up_proj.weight False\n",
213
+ "llama.layers.4.mlp.down_proj.weight False\n",
214
+ "llama.layers.4.input_layernorm.weight False\n",
215
+ "llama.layers.4.post_attention_layernorm.weight False\n",
216
+ "llama.layers.5.self_attn.q_proj.weight False\n",
217
+ "llama.layers.5.self_attn.k_proj.weight False\n",
218
+ "llama.layers.5.self_attn.v_proj.weight False\n",
219
+ "llama.layers.5.self_attn.o_proj.weight False\n",
220
+ "llama.layers.5.mlp.gate_proj.weight False\n",
221
+ "llama.layers.5.mlp.up_proj.weight False\n",
222
+ "llama.layers.5.mlp.down_proj.weight False\n",
223
+ "llama.layers.5.input_layernorm.weight False\n",
224
+ "llama.layers.5.post_attention_layernorm.weight False\n",
225
+ "llama.layers.6.self_attn.q_proj.weight False\n",
226
+ "llama.layers.6.self_attn.k_proj.weight False\n",
227
+ "llama.layers.6.self_attn.v_proj.weight False\n",
228
+ "llama.layers.6.self_attn.o_proj.weight False\n",
229
+ "llama.layers.6.mlp.gate_proj.weight False\n",
230
+ "llama.layers.6.mlp.up_proj.weight False\n",
231
+ "llama.layers.6.mlp.down_proj.weight False\n",
232
+ "llama.layers.6.input_layernorm.weight False\n",
233
+ "llama.layers.6.post_attention_layernorm.weight False\n",
234
+ "llama.norm.weight False\n",
235
+ "score.weight False\n",
236
+ "score.bias False\n"
237
+ ]
238
+ }
239
+ ],
240
+ "source": [
241
+ "class ChemLlama_FT(nn.Module):\n",
242
+ " def __init__(\n",
243
+ " self,\n",
244
+ " model_mtr,\n",
245
+ " linear_param:int=64,\n",
246
+ " use_freeze:bool=True,\n",
247
+ " *args, **kwargs\n",
248
+ " ):\n",
249
+ " super(ChemLlama_FT, self).__init__()\n",
250
+ " # self.save_hyperparameters()\n",
251
+ "\n",
252
+ " self.model_mtr = model_mtr\n",
253
+ " if use_freeze:\n",
254
+ " # self.model_mtr.freeze()\n",
255
+ " for name, param in model_mtr.named_parameters():\n",
256
+ " param.requires_grad = False\n",
257
+ " print(name, param.requires_grad)\n",
258
+ " \n",
259
+ " self.gelu = nn.GELU()\n",
260
+ " self.linear1 = nn.Linear(self.model_mtr.num_labels, linear_param)\n",
261
+ " self.linear2 = nn.Linear(linear_param, linear_param)\n",
262
+ " self.regression = nn.Linear(linear_param, 5)\n",
263
+ "\n",
264
+ " self.loss_fn = nn.L1Loss()\n",
265
+ "\n",
266
+ " def forward(self, input_ids, attention_mask, labels=None):\n",
267
+ " x = self.model_mtr(input_ids=input_ids, attention_mask=attention_mask)\n",
268
+ " x = self.gelu(x)\n",
269
+ " x = self.linear1(x)\n",
270
+ " x = self.gelu(x)\n",
271
+ " x = self.linear2(x)\n",
272
+ " x = self.gelu(x)\n",
273
+ " x = self.regression(x)\n",
274
+ " \n",
275
+ " return x\n",
276
+ " \n",
277
+ "chemllama_ft = ChemLlama_FT(model_mtr=chemllama_mtr)"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 9,
283
+ "id": "49537588-bad0-44ff-b7fd-73683cdb2f6c",
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "# I just reused our previous research code with some modifications.\n",
288
+ "dir_main = \"../\"\n",
289
+ "\n",
290
+ "max_seq_length = 512\n",
291
+ "\n",
292
+ "tokenizer = tokenizer_sl.fn_load_tokenizer_llama(\n",
293
+ " max_seq_length=max_seq_length,\n",
294
+ ")\n",
295
+ "max_length = max_seq_length\n",
296
+ "num_workers = 2\n",
297
+ "\n",
298
+ "## FT\n",
299
+ "\n",
300
+ "dir_model_ft_to_save = f\"{dir_main}/SolLlama-mtr\"\n",
301
+ "# name_model_ft = 'Solvent.pt'\n",
302
+ "name_model_ft = f\"{solute_or_solvent}.pt\""
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": 20,
308
+ "id": "cc155008-a7f1-4dd1-8fc3-ad299a5938a6",
309
+ "metadata": {},
310
+ "outputs": [],
311
+ "source": [
312
+ "device = 'cpu'\n",
313
+ "# Predict\n",
314
+ "local_model_ft = utils_sl.load_model_ft_with(\n",
315
+ " class_model_ft=chemllama_ft, \n",
316
+ " dir_model_ft=dir_model_ft_to_save,\n",
317
+ " name_model_ft=name_model_ft\n",
318
+ ").to(device)\n",
319
+ "\n",
320
+ "# result = trainer.predict(local_model_ft, data_module)\n",
321
+ "# result_pred = list()\n",
322
+ "# result_label = list()\n",
323
+ "# for bat in range(len(result)):\n",
324
+ "# result_pred.append(result[bat][0].squeeze())\n",
325
+ "# result_label.append(result[bat][1])\n",
326
+ "\n",
327
+ "# with open('./smiles_str.txt', 'r') as file:\n",
328
+ "# smiles_str = file.readline()\n",
329
+ " \n",
330
+ "dataset_test = datamodule_finetune_sl.CustomLlamaDatasetAbraham(\n",
331
+ " df=pd.DataFrame([smiles_str]),\n",
332
+ " tokenizer=tokenizer,\n",
333
+ " max_seq_length=max_length\n",
334
+ ")\n",
335
+ "\n",
336
+ "data_collator = DataCollatorWithPadding(tokenizer)\n",
337
+ "dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False, collate_fn=data_collator)"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 31,
343
+ "id": "69baeffd-a2cb-439c-be46-69ee4fc5fea1",
344
+ "metadata": {},
345
+ "outputs": [
346
+ {
347
+ "data": {
348
+ "text/plain": [
349
+ "0 COO2\n",
350
+ "Name: 0, dtype: object"
351
+ ]
352
+ },
353
+ "execution_count": 31,
354
+ "metadata": {},
355
+ "output_type": "execute_result"
356
+ }
357
+ ],
358
+ "source": [
359
+ "pd.DataFrame([smiles_str]).iloc[:,0:].iloc[0]"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 22,
365
+ "id": "7994f626-ca68-4ef1-811d-c2b684cd62ce",
366
+ "metadata": {},
367
+ "outputs": [
368
+ {
369
+ "data": {
370
+ "text/plain": [
371
+ "<datamodule_finetune_sl.CustomLlamaDatasetAbraham at 0x7f81a4f6cf10>"
372
+ ]
373
+ },
374
+ "execution_count": 22,
375
+ "metadata": {},
376
+ "output_type": "execute_result"
377
+ }
378
+ ],
379
+ "source": [
380
+ "dataset_test"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": null,
386
+ "id": "3b173642-1695-40dc-82b5-0e7b775fff38",
387
+ "metadata": {},
388
+ "outputs": [],
389
+ "source": [
390
+ "data_loader_valid = DataLoader(dataset_valid, batch_size=int(batch_size*1.5), shuffle=False, collate_fn=data_collator, num_workers=4, pin_memory=True)"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 21,
396
+ "id": "a6d6145b-d5f9-44e4-85ca-f27b8c8a339d",
397
+ "metadata": {},
398
+ "outputs": [
399
+ {
400
+ "ename": "ValueError",
401
+ "evalue": "Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).",
402
+ "output_type": "error",
403
+ "traceback": [
404
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
405
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
406
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:759\u001b[0m, in \u001b[0;36mBatchEncoding.convert_to_tensors\u001b[0;34m(self, tensor_type, prepend_batch_axis)\u001b[0m\n\u001b[1;32m 758\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_tensor(value):\n\u001b[0;32m--> 759\u001b[0m tensor \u001b[38;5;241m=\u001b[39m \u001b[43mas_tensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 761\u001b[0m \u001b[38;5;66;03m# Removing this for now in favor of controlling the shape with `prepend_batch_axis`\u001b[39;00m\n\u001b[1;32m 762\u001b[0m \u001b[38;5;66;03m# # at-least2d\u001b[39;00m\n\u001b[1;32m 763\u001b[0m \u001b[38;5;66;03m# if tensor.ndim > 2:\u001b[39;00m\n\u001b[1;32m 764\u001b[0m \u001b[38;5;66;03m# tensor = tensor.squeeze(0)\u001b[39;00m\n\u001b[1;32m 765\u001b[0m \u001b[38;5;66;03m# elif tensor.ndim < 2:\u001b[39;00m\n\u001b[1;32m 766\u001b[0m \u001b[38;5;66;03m# tensor = tensor[None, :]\u001b[39;00m\n",
407
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:721\u001b[0m, in \u001b[0;36mBatchEncoding.convert_to_tensors.<locals>.as_tensor\u001b[0;34m(value, dtype)\u001b[0m\n\u001b[1;32m 720\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mtensor(np\u001b[38;5;241m.\u001b[39marray(value))\n\u001b[0;32m--> 721\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n",
408
+ "\u001b[0;31mRuntimeError\u001b[0m: Could not infer dtype of NoneType",
409
+ "\nThe above exception was the direct cause of the following exception:\n",
410
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
411
+ "Cell \u001b[0;32mIn[21], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m local_model_ft\u001b[38;5;241m.\u001b[39meval()\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39minference_mode():\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, v_batch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(dataloader_test):\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# v_input_ids = v_batch['input_ids'].to(device)\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# v_attention_mask = v_batch['attention_mask'].to(device)\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# # v_y_labels = v_batch['labels'].to(device)\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# v_y_logits = local_model_ft(input_ids=v_input_ids, attention_mask=v_attention_mask)\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# list_predictions.append(v_y_logits[0][0].tolist())\u001b[39;00m\n",
412
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/torch/utils/data/dataloader.py:634\u001b[0m, in \u001b[0;36m_BaseDataLoaderIter.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampler_iter \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 632\u001b[0m \u001b[38;5;66;03m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[1;32m 633\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset() \u001b[38;5;66;03m# type: ignore[call-arg]\u001b[39;00m\n\u001b[0;32m--> 634\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 635\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_kind \u001b[38;5;241m==\u001b[39m _DatasetKind\u001b[38;5;241m.\u001b[39mIterable \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 637\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 638\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called:\n",
413
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/torch/utils/data/dataloader.py:678\u001b[0m, in \u001b[0;36m_SingleProcessDataLoaderIter._next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 676\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_next_data\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 677\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_next_index() \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[0;32m--> 678\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset_fetcher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[1;32m 679\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory:\n\u001b[1;32m 680\u001b[0m data \u001b[38;5;241m=\u001b[39m _utils\u001b[38;5;241m.\u001b[39mpin_memory\u001b[38;5;241m.\u001b[39mpin_memory(data, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory_device)\n",
414
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py:54\u001b[0m, in \u001b[0;36m_MapDatasetFetcher.fetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n\u001b[0;32m---> 54\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollate_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n",
415
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/data/data_collator.py:271\u001b[0m, in \u001b[0;36mDataCollatorWithPadding.__call__\u001b[0;34m(self, features)\u001b[0m\n\u001b[1;32m 270\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, features: List[Dict[\u001b[38;5;28mstr\u001b[39m, Any]]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Dict[\u001b[38;5;28mstr\u001b[39m, Any]:\n\u001b[0;32m--> 271\u001b[0m batch \u001b[38;5;241m=\u001b[39m \u001b[43mpad_without_fast_tokenizer_warning\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 274\u001b[0m \u001b[43m \u001b[49m\u001b[43mpadding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 275\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 276\u001b[0m \u001b[43m \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 277\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 278\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlabel\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m batch:\n\u001b[1;32m 280\u001b[0m batch[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlabels\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m batch[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlabel\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
416
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/data/data_collator.py:66\u001b[0m, in \u001b[0;36mpad_without_fast_tokenizer_warning\u001b[0;34m(tokenizer, *pad_args, **pad_kwargs)\u001b[0m\n\u001b[1;32m 63\u001b[0m tokenizer\u001b[38;5;241m.\u001b[39mdeprecation_warnings[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAsking-to-pad-a-fast-tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 66\u001b[0m padded \u001b[38;5;241m=\u001b[39m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpad\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpad_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpad_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 68\u001b[0m \u001b[38;5;66;03m# Restore the state of the warning.\u001b[39;00m\n\u001b[1;32m 69\u001b[0m tokenizer\u001b[38;5;241m.\u001b[39mdeprecation_warnings[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAsking-to-pad-a-fast-tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m warning_state\n",
417
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:3369\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.pad\u001b[0;34m(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)\u001b[0m\n\u001b[1;32m 3366\u001b[0m batch_outputs[key] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 3367\u001b[0m batch_outputs[key]\u001b[38;5;241m.\u001b[39mappend(value)\n\u001b[0;32m-> 3369\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mBatchEncoding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtensor_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m)\u001b[49m\n",
418
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:224\u001b[0m, in \u001b[0;36mBatchEncoding.__init__\u001b[0;34m(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)\u001b[0m\n\u001b[1;32m 220\u001b[0m n_sequences \u001b[38;5;241m=\u001b[39m encoding[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mn_sequences\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_n_sequences \u001b[38;5;241m=\u001b[39m n_sequences\n\u001b[0;32m--> 224\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_to_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensor_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtensor_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprepend_batch_axis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepend_batch_axis\u001b[49m\u001b[43m)\u001b[49m\n",
419
+ "File \u001b[0;32m~/chemllm/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:775\u001b[0m, in \u001b[0;36mBatchEncoding.convert_to_tensors\u001b[0;34m(self, tensor_type, prepend_batch_axis)\u001b[0m\n\u001b[1;32m 770\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moverflowing_tokens\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 771\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 772\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to create tensor returning overflowing tokens of different lengths. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 773\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease see if a fast version of this tokenizer is available to have this feature available.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 774\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[0;32m--> 775\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 776\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to create tensor, you should probably activate truncation and/or padding with\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 777\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpadding=True\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtruncation=True\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m to have batched tensors with the same length. Perhaps your\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 778\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m features (`\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` in this case) have excessive nesting (inputs type `list` where type `int` is\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 779\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m expected).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 780\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 782\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n",
420
+ "\u001b[0;31mValueError\u001b[0m: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected)."
421
+ ]
422
+ }
423
+ ],
424
+ "source": [
425
+ "list_predictions = []\n",
426
+ "local_model_ft.eval()\n",
427
+ "with torch.inference_mode():\n",
428
+ " for i, v_batch in enumerate(dataloader_test):\n",
429
+ " break\n",
430
+ " # v_input_ids = v_batch['input_ids'].to(device)\n",
431
+ " # v_attention_mask = v_batch['attention_mask'].to(device)\n",
432
+ " # # v_y_labels = v_batch['labels'].to(device)\n",
433
+ " # v_y_logits = local_model_ft(input_ids=v_input_ids, attention_mask=v_attention_mask)\n",
434
+ " # list_predictions.append(v_y_logits[0][0].tolist())"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": null,
440
+ "id": "7bc3e296-6871-45fb-8459-78eadc36bb61",
441
+ "metadata": {},
442
+ "outputs": [],
443
+ "source": []
444
+ }
445
+ ],
446
+ "metadata": {
447
+ "kernelspec": {
448
+ "display_name": "chemllm",
449
+ "language": "python",
450
+ "name": "chemllm"
451
+ },
452
+ "language_info": {
453
+ "codemirror_mode": {
454
+ "name": "ipython",
455
+ "version": 3
456
+ },
457
+ "file_extension": ".py",
458
+ "mimetype": "text/x-python",
459
+ "name": "python",
460
+ "nbconvert_exporter": "python",
461
+ "pygments_lexer": "ipython3",
462
+ "version": "3.11.3"
463
+ }
464
+ },
465
+ "nbformat": 4,
466
+ "nbformat_minor": 5
467
+ }
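The notebook above appears to stop at that collator error because the `datamodule_finetune_sl` importable at the time still returned `labels: None`; with the dataset fix in this commit the single-SMILES batch can be built and the commented-out loop body in that cell becomes runnable. Below is a hedged sketch of that inference step, reusing the notebook's own objects (`local_model_ft`, `dataloader_test`, `device`); taking the full output row is an assumption about the regression head, whereas the notebook's commented line indexes `[0][0]`.

# Sketch of the intended single-SMILES inference loop, mirroring the
# commented-out lines in the notebook's error cell (assumes the fixed dataset).
list_predictions = []
local_model_ft.eval()
with torch.inference_mode():
    for v_batch in dataloader_test:
        v_input_ids = v_batch["input_ids"].to(device)
        v_attention_mask = v_batch["attention_mask"].to(device)
        v_y_logits = local_model_ft(input_ids=v_input_ids, attention_mask=v_attention_mask)
        # ChemLlama_FT's regression head outputs shape (batch, 5); keep the
        # row for the single input SMILES.
        list_predictions.append(v_y_logits[0].tolist())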
__pycache__/auto_evaluator_sl.cpython-311.pyc ADDED
Binary file (9.12 kB).
 
__pycache__/chemllama_mtr.cpython-311.pyc ADDED
Binary file (7.89 kB).
 
__pycache__/datamodule_finetune_sl.cpython-311.pyc ADDED
Binary file (5.45 kB).
 
__pycache__/model_finetune_sl.cpython-311.pyc ADDED
Binary file (6.43 kB).
 
__pycache__/tokenizer_sl.cpython-311.pyc ADDED
Binary file (1.43 kB).
 
__pycache__/utils_sl.cpython-311.pyc ADDED
Binary file (3.55 kB).
 
datamodule_finetune_sl.py CHANGED
@@ -61,7 +61,7 @@ class CustomLlamaDatasetAbraham(Dataset):
         return {
             "input_ids": torch.tensor(local_encoded["input_ids"]),
             "attention_mask": torch.tensor(local_encoded["attention_mask"]),
-            "labels": None,
+            "labels": torch.tensor(local_encoded["input_ids"]), # this one does not matter for sl
         }
 
 class CustomFinetuneDataModule(L.LightningDataModule):