diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..c6ae8ff1d4d61bc316f00f441ff787c0dcea4949 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,20 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +finetune_data/human_chr1/train.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/human_mouse_superclass/train.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/human_mouse_superclass_allchr/dev.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/human_mouse_superclass_allchr/test.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/human_mouse_superclass_allchr/train.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/EXC/dev.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/EXC/test.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/EXC/train.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/GLIA/dev.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/GLIA/test.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/GLIA/train.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/INH/dev.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/INH/test.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/INH/train.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/VASC/dev.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/VASC/test.csv filter=lfs diff=lfs merge=lfs -text +finetune_data/super_binary/VASC/train.csv filter=lfs diff=lfs merge=lfs -text diff --git a/Finetune-species/super_all/super_log_1e-5_len2_4096.txt b/Finetune-species/super_all/super_log_1e-5_len2_4096.txt new file mode 100644 index 0000000000000000000000000000000000000000..8db99640a641a8146cd6062c55969e35059b6b44 --- /dev/null +++ 
b/Finetune-species/super_all/super_log_1e-5_len2_4096.txt @@ -0,0 +1,832 @@ +nohup: ignoring input +The provided data_path is finetune_data/human_mouse_superclass_allchr +wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc +wandb: Currently logged in as: n5huang (n5huang-uc-san-diego) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +wandb: Tracking run with wandb version 0.23.1 +wandb: Run data is saved locally in /root/NaN/dna-tokenizer/SFT/wandb/run-20260209_054513-r5m4bef6 +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run breezy-sky-5 +wandb: โญ๏ธ View project at https://wandb.ai/n5huang-uc-san-diego/mouse_all +wandb: ๐Ÿš€ View run at https://wandb.ai/n5huang-uc-san-diego/mouse_all/runs/r5m4bef6 +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... 
+Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/NaN/dna-tokenizer/pretrain/models/model_len2_4096/checkpoint-100000 and are newly initialized: ['bert.pooler.dense.bias', 'classifier.bias', 'bert.pooler.dense.weight', 'classifier.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/accelerate/accelerator.py:439: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + self.scaler = torch.cuda.amp.GradScaler(**kwargs) +Using auto half precision backend +***** Running training ***** + Num examples = 499,426 + Num Epochs = 4 + Instantaneous batch size per device = 128 + Total train batch size (w. parallel, distributed & accumulation) = 128 + Gradient Accumulation steps = 1 + Total optimization steps = 15,608 + Number of trainable parameters = 89,190,148 +Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" + 0%| | 0/15608 [00:00 + sys.exit(main()) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 901, in main + run(args) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in run + elastic_launch( + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 143, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent + result = agent.run() + File 
"/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 715, in run + result = self._invoke_run(role) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 879, in _invoke_run + time.sleep(monitor_interval) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 280883 got signal: 1 diff --git a/Finetune-species/super_all/super_log_2e-5_len2_4096.txt b/Finetune-species/super_all/super_log_2e-5_len2_4096.txt new file mode 100644 index 0000000000000000000000000000000000000000..598ea890e8ab6df1b52b2c33a509c01c3e57cc73 --- /dev/null +++ b/Finetune-species/super_all/super_log_2e-5_len2_4096.txt @@ -0,0 +1,833 @@ +nohup: ignoring input +The provided data_path is finetune_data/human_mouse_superclass_allchr +wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc +wandb: Currently logged in as: n5huang (n5huang-uc-san-diego) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. 
+ _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +wandb: setting up run 9fcvvu94 +wandb: Tracking run with wandb version 0.23.1 +wandb: Run data is saved locally in /root/NaN/dna-tokenizer/SFT/wandb/run-20260209_054347-9fcvvu94 +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run happy-bush-4 +wandb: โญ๏ธ View project at https://wandb.ai/n5huang-uc-san-diego/mouse_all +wandb: ๐Ÿš€ View run at https://wandb.ai/n5huang-uc-san-diego/mouse_all/runs/9fcvvu94 +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/NaN/dna-tokenizer/pretrain/models/model_len2_4096/checkpoint-100000 and are newly initialized: ['classifier.bias', 'bert.pooler.dense.weight', 'classifier.weight', 'bert.pooler.dense.bias'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/accelerate/accelerator.py:439: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + self.scaler = torch.cuda.amp.GradScaler(**kwargs) +Using auto half precision backend +***** Running training ***** + Num examples = 499,426 + Num Epochs = 4 + Instantaneous batch size per device = 128 + Total train batch size (w. 
parallel, distributed & accumulation) = 128 + Gradient Accumulation steps = 1 + Total optimization steps = 15,608 + Number of trainable parameters = 89,190,148 +Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" + 0%| | 0/15608 [00:00 + sys.exit(main()) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 901, in main + run(args) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in run + elastic_launch( + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 143, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent + result = agent.run() + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 715, in run + result = self._invoke_run(role) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 879, in _invoke_run + time.sleep(monitor_interval) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 279533 got signal: 1 diff --git a/Finetune-species/super_all/super_log_3e-5_len2_3072.txt 
b/Finetune-species/super_all/super_log_3e-5_len2_3072.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0c9d91b9a384f4b80c93f83f6ecc3a14496463c --- /dev/null +++ b/Finetune-species/super_all/super_log_3e-5_len2_3072.txt @@ -0,0 +1,827 @@ +nohup: ignoring input +The provided data_path is finetune_data/human_mouse_superclass_allchr +wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc +wandb: Currently logged in as: n5huang (n5huang-uc-san-diego) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +wandb: setting up run 3e95uwyp +wandb: Tracking run with wandb version 0.23.1 +wandb: Run data is saved locally in /root/NaN/dna-tokenizer/SFT/wandb/run-20260209_052301-3e95uwyp +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run driven-puddle-2 +wandb: โญ๏ธ View project at https://wandb.ai/n5huang-uc-san-diego/mouse_all +wandb: ๐Ÿš€ View run at https://wandb.ai/n5huang-uc-san-diego/mouse_all/runs/3e95uwyp +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... 
+Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/NaN/dna-tokenizer/pretrain/models/model_len2_4096/checkpoint-100000 and are newly initialized: ['bert.pooler.dense.weight', 'classifier.weight', 'bert.pooler.dense.bias', 'classifier.bias'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/accelerate/accelerator.py:439: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + self.scaler = torch.cuda.amp.GradScaler(**kwargs) +Using auto half precision backend +***** Running training ***** + Num examples = 499,426 + Num Epochs = 4 + Instantaneous batch size per device = 128 + Total train batch size (w. parallel, distributed & accumulation) = 128 + Gradient Accumulation steps = 1 + Total optimization steps = 15,608 + Number of trainable parameters = 89,190,148 +Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" + 0%| | 0/15608 [00:00 + sys.exit(main()) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 901, in main + run(args) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in run + elastic_launch( + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 143, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent + result = agent.run() + File 
"/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 715, in run + result = self._invoke_run(role) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 879, in _invoke_run + time.sleep(monitor_interval) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 283626 got signal: 1 +Running fine-tune for GLIA from finetune_data/super_binary/GLIA +wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc +wandb: Currently logged in as: n5huang (n5huang-uc-san-diego) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. 
+ _torch_pytree._register_pytree_node( +wandb: setting up run z9fhixvx +wandb: Tracking run with wandb version 0.23.1 +wandb: Run data is saved locally in /root/NaN/dna-tokenizer/SFT/wandb/run-20260209_061052-z9fhixvx +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run zany-elevator-4 +wandb: โญ๏ธ View project at https://wandb.ai/n5huang-uc-san-diego/super_binary +wandb: ๐Ÿš€ View run at https://wandb.ai/n5huang-uc-san-diego/super_binary/runs/z9fhixvx +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/NaN/dna-tokenizer/pretrain/models/model_len2_4096/checkpoint-100000 and are newly initialized: ['classifier.weight', 'classifier.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/accelerate/accelerator.py:439: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + self.scaler = torch.cuda.amp.GradScaler(**kwargs) +Using auto half precision backend +***** Running training ***** + Num examples = 307,529 + Num Epochs = 8 + Instantaneous batch size per device = 128 + Total train batch size (w. 
parallel, distributed & accumulation) = 128 + Gradient Accumulation steps = 1 + Total optimization steps = 19,224 + Number of trainable parameters = 89,188,610 +Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" + 0%| | 0/19224 [00:00 + sys.exit(main()) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 901, in main + run(args) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in run + elastic_launch( + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 143, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent + result = agent.run() + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 715, in run + result = self._invoke_run(role) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 879, in _invoke_run + time.sleep(monitor_interval) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 459978 got signal: 1 +Running fine-tune for INH from finetune_data/super_binary/INH +wandb: Appending key for api.wandb.ai to 
your netrc file: /root/.netrc +wandb: Currently logged in as: n5huang (n5huang-uc-san-diego) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +wandb: Tracking run with wandb version 0.23.1 +wandb: Run data is saved locally in /root/NaN/dna-tokenizer/SFT/wandb/run-20260209_112356-zajgrrb5 +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run dazzling-elevator-17 +wandb: โญ๏ธ View project at https://wandb.ai/n5huang-uc-san-diego/super_binary +wandb: ๐Ÿš€ View run at https://wandb.ai/n5huang-uc-san-diego/super_binary/runs/zajgrrb5 +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/NaN/dna-tokenizer/pretrain/models/base_3072/checkpoint-100000 and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. 
+/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/accelerate/accelerator.py:439: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + self.scaler = torch.cuda.amp.GradScaler(**kwargs) +Using auto half precision backend +***** Running training ***** + Num examples = 307,529 + Num Epochs = 8 + Instantaneous batch size per device = 128 + Total train batch size (w. parallel, distributed & accumulation) = 128 + Gradient Accumulation steps = 1 + Total optimization steps = 19,224 + Number of trainable parameters = 88,402,178 +Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" + 0%| | 0/19224 [00:00 + sys.exit(main()) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 901, in main + run(args) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in run + elastic_launch( + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 143, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent + result = agent.run() + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 715, in run + result = self._invoke_run(role) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 879, in _invoke_run + 
time.sleep(monitor_interval) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 460790 got signal: 1 +Running fine-tune for INH from finetune_data/super_binary/INH +wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc +wandb: Currently logged in as: n5huang (n5huang-uc-san-diego) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +wandb: setting up run miv0rxse +wandb: Tracking run with wandb version 0.23.1 +wandb: Run data is saved locally in /root/NaN/dna-tokenizer/SFT/wandb/run-20260209_112356-miv0rxse +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run radiant-spaceship-17 +wandb: โญ๏ธ View project at https://wandb.ai/n5huang-uc-san-diego/super_binary +wandb: ๐Ÿš€ View run at https://wandb.ai/n5huang-uc-san-diego/super_binary/runs/miv0rxse +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... 
+WARNING:root:Perform single sequence classification... +Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/NaN/dna-tokenizer/pretrain/models/base_4096/checkpoint-100000 and are newly initialized: ['classifier.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'bert.pooler.dense.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/accelerate/accelerator.py:439: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + self.scaler = torch.cuda.amp.GradScaler(**kwargs) +Using auto half precision backend +***** Running training ***** + Num examples = 307,529 + Num Epochs = 8 + Instantaneous batch size per device = 128 + Total train batch size (w. parallel, distributed & accumulation) = 128 + Gradient Accumulation steps = 1 + Total optimization steps = 19,224 + Number of trainable parameters = 89,188,610 +Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" + 0%| | 0/19224 [00:00 + sys.exit(main()) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 901, in main + run(args) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in run + elastic_launch( + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 143, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent + result = agent.run() + File 
"/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 715, in run + result = self._invoke_run(role) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 879, in _invoke_run + time.sleep(monitor_interval) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 282776 got signal: 1 +Running fine-tune for GLIA from finetune_data/super_binary/GLIA +wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc +wandb: Currently logged in as: n5huang (n5huang-uc-san-diego) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. 
+ _torch_pytree._register_pytree_node( +wandb: Tracking run with wandb version 0.23.1 +wandb: Run data is saved locally in /root/NaN/dna-tokenizer/SFT/wandb/run-20260209_061052-p54wnjbw +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run prime-forest-6 +wandb: โญ๏ธ View project at https://wandb.ai/n5huang-uc-san-diego/super_binary +wandb: ๐Ÿš€ View run at https://wandb.ai/n5huang-uc-san-diego/super_binary/runs/p54wnjbw +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/NaN/dna-tokenizer/pretrain/models/model_len2_4096/checkpoint-100000 and are newly initialized: ['classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight', 'bert.pooler.dense.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/accelerate/accelerator.py:439: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + self.scaler = torch.cuda.amp.GradScaler(**kwargs) +Using auto half precision backend +***** Running training ***** + Num examples = 307,529 + Num Epochs = 8 + Instantaneous batch size per device = 128 + Total train batch size (w. 
parallel, distributed & accumulation) = 128 + Gradient Accumulation steps = 1 + Total optimization steps = 19,224 + Number of trainable parameters = 89,188,610 +Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" + 0%| | 0/19224 [00:00 + sys.exit(main()) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 901, in main + run(args) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in run + elastic_launch( + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 143, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent + result = agent.run() + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 715, in run + result = self._invoke_run(role) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 879, in _invoke_run + time.sleep(monitor_interval) + File "/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 284379 got signal: 1 +Running fine-tune for GLIA from finetune_data/super_binary/GLIA +wandb: Appending key for api.wandb.ai to 
your netrc file: /root/.netrc +wandb: Currently logged in as: n5huang (n5huang-uc-san-diego) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +wandb: Tracking run with wandb version 0.23.1 +wandb: Run data is saved locally in /root/NaN/dna-tokenizer/SFT/wandb/run-20260209_061052-64ywiw8s +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run generous-star-5 +wandb: โญ๏ธ View project at https://wandb.ai/n5huang-uc-san-diego/super_binary +wandb: ๐Ÿš€ View run at https://wandb.ai/n5huang-uc-san-diego/super_binary/runs/64ywiw8s +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/NaN/dna-tokenizer/pretrain/models/model_len2_4096/checkpoint-100000 and are newly initialized: ['classifier.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'bert.pooler.dense.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. 
+/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/accelerate/accelerator.py:439: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + self.scaler = torch.cuda.amp.GradScaler(**kwargs) +Using auto half precision backend +***** Running training ***** + Num examples = 307,529 + Num Epochs = 8 + Instantaneous batch size per device = 128 + Total train batch size (w. parallel, distributed & accumulation) = 128 + Gradient Accumulation steps = 1 + Total optimization steps = 19,224 + Number of trainable parameters = 89,188,610 +Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" + 0%| | 0/19224 [00:00= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/tokenization_motif.py b/tokenization_motif.py new file mode 100644 index 0000000000000000000000000000000000000000..5691227559817356840e321cffbbe3406a118833 --- /dev/null +++ b/tokenization_motif.py @@ -0,0 +1,400 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def load_vocab(vocab_file):
    """Load a newline-delimited vocabulary file into an OrderedDict of token -> id.

    Each line holds exactly one token; its 0-based line number becomes its id.

    Args:
        vocab_file: path to a one-token-per-line UTF-8 vocabulary file.

    Returns:
        collections.OrderedDict mapping token string to integer id, in file order.
    """
    vocab = collections.OrderedDict()
    # Stream the file instead of readlines(): avoids holding both the raw line
    # list and the dict for a potentially large vocabulary in memory at once.
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            # Strip only the trailing newline; leading/interior whitespace can
            # be significant for some vocabularies.
            vocab[line.rstrip("\n")] = index
    return vocab


def whitespace_tokenize(text):
    """Run basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    return text.split()
# --- MotifTokenizer methods (the `class MotifTokenizer(PreTrainedTokenizer):`
# --- header appears earlier in the file).

def __init__(
    self,
    vocab_file,
    do_lower_case=False,
    never_split=None,
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    tokenize_chinese_chars=False,
    additional_special_tokens=None,
    **kwargs
):
    """Construct the tokenizer from a one-token-per-line vocabulary file.

    Args:
        vocab_file: path to a one-wordpiece-per-line vocabulary file.
        do_lower_case: lower-case input in the basic tokenizer.
        never_split: tokens that must never be split.
        tokenize_chinese_chars: add spaces around CJK characters before splitting.
        additional_special_tokens: extra special tokens kept un-split.

    Raises:
        ValueError: if ``vocab_file`` does not point to an existing file.
    """
    super().__init__(
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        **kwargs,
    )
    # BUG FIX: validate the path *before* reading it. The original called
    # load_vocab() first, so open() raised before this check could ever fire.
    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
            "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
        )
    self.vocab = load_vocab(vocab_file)
    self.max_len_single_sentence = self.model_max_length - 2  # room for [CLS] + [SEP]
    self.max_len_sentences_pair = self.model_max_length - 3  # room for [CLS] + 2x[SEP]
    self.ids_to_tokens = collections.OrderedDict((ids, tok) for tok, ids in self.vocab.items())
    self._additional_special_tokens = additional_special_tokens or []
    self.basic_tokenizer = BasicTokenizer(
        do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars
    )


def add_tokens(self, new_tokens):
    """Register extra special tokens (deduplicated) so they survive tokenization intact."""
    self._additional_special_tokens.extend(new_tokens)
    self._additional_special_tokens = list(set(self._additional_special_tokens))  # Remove duplicates
    print(f"Custom tokens added: {new_tokens}")
    print(f"Updated additional_special_tokens: {self._additional_special_tokens}")


@property
def all_special_tokens(self):
    """All special tokens: the standard map (cls/unk/...) plus custom additions, deduplicated.

    NOTE: deduplication via set() makes the returned order non-deterministic;
    callers must not rely on ordering.
    """
    all_toks = []
    for attr_value in self.special_tokens_map.values():
        # Map values may be a single token or a list of tokens.
        all_toks += (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
    all_toks += self._additional_special_tokens
    return list(set(all_toks))


@property
def vocab_size(self):
    """Number of entries in the loaded vocabulary."""
    return len(self.vocab)


def _tokenize(self, text):
    """Split *text* with the basic tokenizer, protecting all special tokens."""
    return list(self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens))
# --- MotifTokenizer id / special-token helpers (methods of MotifTokenizer).

# Payload tokens per 512-token BERT window: 512 minus [CLS] and [SEP].
_CHUNK = 510


def _num_chunks(n):
    """Number of 510-token windows needed for n ids (ceil division, at least 1)."""
    return max(1, -(-n // _CHUNK))


def _convert_id_to_token(self, index):
    """Converts an index (integer) in a token (str) using the vocab; unknown ids map to unk_token."""
    return self.ids_to_tokens.get(index, self.unk_token)


def convert_tokens_to_string(self, tokens):
    """Join word-piece tokens into one string, removing '##' continuation markers."""
    out_string = " ".join(tokens).replace(" ##", "").strip()
    return out_string


def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """Add special tokens for classification.

    single sequence: ``[CLS] X [SEP]`` — sequences longer than 510 ids are
    split into consecutive 510-id windows, each wrapped in its own
    ``[CLS]``/``[SEP]`` pair.
    pair of sequences: ``[CLS] A [SEP] B [SEP]``.

    BUG FIX: the original used ``len // 510 + 1`` windows, which appended an
    empty ``[CLS][SEP]`` pair whenever the length was an exact multiple of 510.
    """
    cls = [self.cls_token_id]
    sep = [self.sep_token_id]
    if token_ids_1 is not None:
        return cls + token_ids_0 + sep + token_ids_1 + sep
    if len(token_ids_0) <= _CHUNK:
        return cls + token_ids_0 + sep
    output = []
    for i in range(_num_chunks(len(token_ids_0))):
        output.extend(cls + token_ids_0[_CHUNK * i:_CHUNK * (i + 1)] + sep)
    return output


def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
    """Return 1 for special-token positions and 0 for sequence tokens.

    Mirrors the windowing performed by :func:`build_inputs_with_special_tokens`
    (same ceil-division fix; the original also carried an unreachable final
    return, removed here).
    """
    if already_has_special_tokens:
        if token_ids_1 is not None:
            raise ValueError(
                "You should not supply a second sequence if the provided sequence of "
                "ids is already formated with special tokens for the model."
            )
        return [1 if x in (self.sep_token_id, self.cls_token_id) else 0 for x in token_ids_0]
    if token_ids_1 is not None:
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
    if len(token_ids_0) <= _CHUNK:
        return [1] + ([0] * len(token_ids_0)) + [1]
    output = []
    remaining = len(token_ids_0)
    for _ in range(_num_chunks(len(token_ids_0))):
        output.extend([1] + ([0] * min(remaining, _CHUNK)) + [1])
        remaining -= _CHUNK
    return output


def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
    """Segment ids for a (pair of) sequence(s): 0s for the first, 1s for the second.

    For a single over-long sequence the length accounts for the extra
    [CLS]/[SEP] pair that every additional 510-id window introduces
    (consistent with the ceil-division window count used above).
    """
    sep = [self.sep_token_id]
    cls = [self.cls_token_id]
    if token_ids_1 is None:
        if len(token_ids_0) <= _CHUNK:
            return len(cls + token_ids_0 + sep) * [0]
        num_pieces = _num_chunks(len(token_ids_0))
        # Each extra window adds its own [CLS]/[SEP] pair (2 tokens).
        return (len(cls + token_ids_0 + sep) + 2 * (num_pieces - 1)) * [0]
    return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]


def save_vocabulary(self, vocab_path, filename_prefix=None):
    """Write the vocabulary, one token per line in id order; return the path as a 1-tuple."""
    index = 0
    if os.path.isdir(vocab_path):
        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
    else:
        vocab_file = vocab_path
    with open(vocab_file, "w", encoding="utf-8") as writer:
        for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
            if index != token_index:
                # Gaps mean ids will shift on reload; warn but keep writing.
                logger.warning(
                    "Saving vocabulary to {}: vocabulary indices are not consecutive."
                    " Please check that the vocabulary is not corrupted!".format(vocab_file)
                )
                index = token_index
            writer.write(token + "\n")
            index += 1
    return (vocab_file,)
# --- BasicTokenizer methods and helpers (class header appears earlier).

def tokenize(self, text, never_split=None):
    """Whitespace + punctuation tokenization; *never_split* tokens pass through untouched."""
    # Merge the instance-level protected tokens with the per-call ones.
    protected = self.never_split + (never_split if never_split is not None else [])
    text = self._clean_text(text)
    # NOTE: kept from the BERT reference implementation (the Nov 2018
    # multilingual/Chinese change); harmless for inputs without CJK text.
    pieces = []
    for token in whitespace_tokenize(text):
        if token not in protected:
            token = self._run_strip_accents(token)
        pieces.extend(self._run_split_on_punc(token, protected))
    return whitespace_tokenize(" ".join(pieces))


def _run_strip_accents(self, text):
    """Drop combining accent marks (Unicode category Mn) after NFD-decomposing *text*."""
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")


def _run_split_on_punc(self, text, never_split=None):
    """Split *text* at punctuation; every punctuation character becomes its own token."""
    if never_split is not None and text in never_split:
        return [text]
    groups = []  # list of char-lists; punctuation opens (and closes) a group
    begin_group = True
    for ch in text:
        if _is_punctuation(ch):
            groups.append([ch])
            begin_group = True
        else:
            if begin_group:
                groups.append([])
                begin_group = False
            groups[-1].append(ch)
    return ["".join(g) for g in groups]


def _clean_text(self, text):
    """Remove NUL/replacement/control characters and map all whitespace to a single space."""
    kept = []
    for ch in text:
        code = ord(ch)
        if code == 0 or code == 0xFFFD or _is_control(ch):
            continue
        kept.append(" " if _is_whitespace(ch) else ch)
    return "".join(kept)


def _is_whitespace(char):
    """True for space/tab/newline/CR or any Unicode 'Zs' (space separator) character."""
    # \t, \n and \r are control characters technically, but treated as whitespace.
    if char in (" ", "\t", "\n", "\r"):
        return True
    return unicodedata.category(char) == "Zs"
def _is_punctuation(char):
    """Checks whether `char` is a punctuation character.

    All non-letter/number ASCII (e.g. "^", "$", "`") counts as punctuation for
    consistency, even though Unicode does not place those in a 'P*' category.
    """
    cp = ord(char)
    in_ascii_punct_range = (
        33 <= cp <= 47 or 58 <= cp <= 64 or 91 <= cp <= 96 or 123 <= cp <= 126
    )
    return in_ascii_punct_range or unicodedata.category(char).startswith("P")
# Registry mapping the --model CLI value to (config class, LM head class, tokenizer class).
MODEL_CLASSES = {
    "dna": (BertConfig, BertForMaskedLM, DNATokenizer),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
    "motifBert": (BertConfig, BertForMaskedLM, MotifTokenizer)
}

@dataclass
class ModelArguments:
    # Arguments selecting the pretrained checkpoint and configuring optional LoRA adapters.
    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
    trust_remote_code: bool = field(default=False, metadata={"help": "for custom models(has custom code that needs to be executed (e.g., custom architectures, tokenizers, or modeling files)), whether local or from the Hub"})
    use_lora: bool = field(default=False, metadata={"help": "whether to use LoRA"})
    # LoRA rank of the low-rank update matrices.
    lora_r: int = field(default=8, metadata={"help": "hidden dimension for LoRA"})
    # LoRA scaling factor (effective scale is alpha / r).
    lora_alpha: int = field(default=32, metadata={"help": "alpha for LoRA"})
    lora_dropout: float = field(default=0.05, metadata={"help": "dropout rate for LoRA"})
    # Comma-separated attention module names (e.g. "query,value") that receive adapters.
    lora_target_modules: str = field(default="query,value", metadata={"help": "where to perform LoRA"})
    tokenizer_path: Optional[str] = field(default="facebook/opt-125m")
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """Fine-tuning arguments: HF TrainingArguments plus project-specific flags."""

    vocab_file: Optional[str] = field(
        default=None,
        metadata={"help": "Path to custom vocabulary file (overrides Hugging Face default)"}
    )
    cache_dir: Optional[str] = field(default=None)
    run_name: str = field(default="run")
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(default=512, metadata={"help": "Maximum sequence length."})
    gradient_accumulation_steps: int = field(default=1)
    per_device_train_batch_size: int = field(default=1)
    per_device_eval_batch_size: int = field(default=1)
    num_train_epochs: int = field(default=1)
    fp16: bool = field(default=False)
    logging_steps: int = field(default=100)
    save_steps: int = field(default=100)
    eval_steps: int = field(default=100)
    # BUG FIX: the original line ended with a stray comma
    # (`... = field(default="steps"),`), which made the default a 1-tuple
    # containing the Field object instead of the Field itself.
    evaluation_strategy: str = field(default="steps")
    warmup_steps: int = field(default=50)
    weight_decay: float = field(default=0.01)
    learning_rate: float = field(default=1e-4)
    save_total_limit: int = field(default=3)
    load_best_model_at_end: bool = field(default=False)
    output_dir: str = field(default="output")
    find_unused_parameters: bool = field(default=False)
    checkpointing: bool = field(default=False)
    dataloader_pin_memory: bool = field(default=False)
    # Run evaluation and dump metrics after training finishes.
    eval_and_save_results: bool = field(default=True)
    save_model: bool = field(default=False)
    seed: int = field(default=42)
    project_name: str = field(default=None)


def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
    """Collects the state dict and dump to disk.

    Tensors are moved to CPU first so the checkpoint is device-agnostic and GPU
    memory is released before serialization; only the main process saves.
    """
    state_dict = trainer.model.state_dict()
    if trainer.args.should_save:
        cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
        del state_dict  # free GPU references before the (potentially slow) write
        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
"""
Get the base-wise complement of a DNA sequence (same orientation).

NOTE(review): the original docstring said "reversed complement", but the
reversal is commented out below — this returns the complement in the original
order. Confirm which strand convention callers expect before re-enabling it.
"""
def get_alter_of_dna_sequence(sequence: str):
    MAP = {"A": "T", "T": "A", "C": "G", "G": "C"}
    # return "".join([MAP[c] for c in reversed(sequence)])
    return "".join([MAP[c] for c in sequence])


"""
Transform a dna sequence to k-mer string
"""
def generate_kmer_str(sequence: str, k: int) -> str:
    """Generate k-mer string from DNA sequence (overlapping windows joined by spaces)."""
    return " ".join([sequence[i:i+k] for i in range(len(sequence) - k + 1)])


"""
Load or generate k-mer string for each DNA sequence. The generated k-mer string
will be saved to the same directory as the original data with the same name but
with a suffix of "_{k}mer".
"""
def load_or_generate_kmer(data_path: str, texts: List[str], k: int) -> List[str]:
    """Load cached k-mer strings for *data_path*, generating and caching them if absent."""
    kmer_path = data_path.replace(".csv", f"_{k}mer.json")
    if os.path.exists(kmer_path):
        logging.warning(f"Loading k-mer from {kmer_path}...")
        with open(kmer_path, "r") as f:
            kmer = json.load(f)
    else:
        logging.warning(f"Generating k-mer...")
        kmer = [generate_kmer_str(text, k) for text in texts]
        with open(kmer_path, "w") as f:
            logging.warning(f"Saving k-mer to {kmer_path}...")
            json.dump(kmer, f)
    return kmer


def load_customized_data(data_path: str, texts: List[str], customized_tokenizer: str) -> List[str]:
    """Load pre-tokenized text produced by *customized_tokenizer* from the JSON cache next to *data_path*.

    Raises:
        FileNotFoundError: if the cache is missing. (BUG FIX: the original fell
        through and crashed with UnboundLocalError on the unset ``data``.)
    """
    customize_path = data_path.replace(".csv", f"_{customized_tokenizer}.json")
    print(customize_path)
    if not os.path.exists(customize_path):
        raise FileNotFoundError(
            f"Customized-tokenizer cache not found: {customize_path}"
        )
    logging.warning(f"Loading data by customized tokenizer from {customize_path}...")
    with open(customize_path, "r") as f:
        data = json.load(f)
    return data


def _encode_labels(raw_labels):
    """Map raw label strings to contiguous integer ids (sorted for determinism)."""
    label_set = sorted(set(raw_labels))  # get unique labels
    label2id = {label: idx for idx, label in enumerate(label_set)}  # map labels to integers
    return [label2id[lab] for lab in raw_labels], label2id


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(self,
                 data_path: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 kmer: int = -1,
                 customized_tokenizer=None):

        super(SupervisedDataset, self).__init__()

        # Load data from the disk, skipping the CSV header row.
        with open(data_path, "r") as f:
            data = list(csv.reader(f, delimiter=','))[1:]
        if len(data[0]) == 2:
            # data is in the format of [text, label]
            logging.warning("Perform single sequence classification...")
            texts = [d[0] for d in data]
            labels, _ = _encode_labels([d[1] for d in data])
        elif len(data[0]) == 3:
            # data is in the format of [text1, text2, label]
            logging.warning("Perform sequence-pair classification...")
            texts = [[d[0], d[1]] for d in data]
            # BUG FIX: the label is column 2; the original re-used column 1
            # (the second text) as the label, contradicting its own
            # commented-out `int(d[2])` line.
            labels, _ = _encode_labels([d[2] for d in data])
        elif len(data[0]) == 5:
            logging.warning("Perform single sequence classification on Genomic Benchmarks...")
            texts = [d[4] for d in data]
            # BUG FIX: Genomic Benchmarks stores the label in column 0 (see the
            # original commented-out `int(d[0])`); column 1 was used by mistake.
            labels, _ = _encode_labels([d[0] for d in data])
        else:
            raise ValueError("Data format not supported.")

        if kmer != -1:
            logging.warning(f"Using {kmer}-mer as input...")
            texts = load_or_generate_kmer(data_path, texts, kmer)
        elif kmer == -1 and customized_tokenizer:
            logging.warning(f"Using {customized_tokenizer} as input...")
            texts = load_customized_data(data_path, texts, customized_tokenizer)

        # Tokenize everything up front; padding to the longest example in the
        # split keeps __getitem__ trivial.
        output = tokenizer(
            texts,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )

        self.input_ids = output["input_ids"]
        self.attention_mask = output["attention_mask"]
        self.labels = labels
        self.num_labels = len(set(labels))

    def __len__(self):
        return len(self.input_ids)
def __getitem__(self, i) -> Dict[str, torch.Tensor]: + return dict(input_ids=self.input_ids[i], labels=self.labels[i]) + + +@dataclass +class DataCollatorForSupervisedDataset(object): + """Collate examples for supervised fine-tuning.""" + + tokenizer: transformers.PreTrainedTokenizer + + def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels")) + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id + ) + labels = torch.Tensor(labels).long() + return dict( + input_ids=input_ids, + labels=labels, + attention_mask=input_ids.ne(self.tokenizer.pad_token_id), + ) + +""" +Manually calculate the accuracy, f1, matthews_correlation, precision, recall with sklearn. +""" +def calculate_metric_with_sklearn(predictions: np.ndarray, labels: np.ndarray): + valid_mask = labels != -100 # Exclude padding tokens (assuming -100 is the padding token ID) + valid_predictions = predictions[valid_mask] + valid_labels = labels[valid_mask] + return { + "accuracy": sklearn.metrics.accuracy_score(valid_labels, valid_predictions), + "f1": sklearn.metrics.f1_score( + valid_labels, valid_predictions, average="macro", zero_division=0 + ), + "matthews_correlation": sklearn.metrics.matthews_corrcoef( + valid_labels, valid_predictions + ), + "precision": sklearn.metrics.precision_score( + valid_labels, valid_predictions, average="macro", zero_division=0 + ), + "recall": sklearn.metrics.recall_score( + valid_labels, valid_predictions, average="macro", zero_division=0 + ), + } + +# from: https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/13 +def preprocess_logits_for_metrics(logits:Union[torch.Tensor, Tuple[torch.Tensor, Any]], _): + if isinstance(logits, tuple): # Unpack logits if it's a tuple + logits = logits[0] + + if logits.ndim == 3: + # Reshape logits to 2D if needed 
+ logits = logits.reshape(-1, logits.shape[-1]) + + return torch.argmax(logits, dim=-1) + + +""" +Compute metrics used for huggingface trainer. +""" +def compute_metrics(eval_pred): + predictions, labels = eval_pred + return calculate_metric_with_sklearn(predictions, labels) + +def load_token_v5_1(tokenizer_kwargs): + config_class, model_class, tokenizer_class = MODEL_CLASSES['motifBert'] + tokenizer = MotifTokenizer(**tokenizer_kwargs) + + bases = ['A', 'T', 'C', 'G'] + + token_wc = [ + f"{operator}_POS_{i}_*_{char}" + for operator, i, char in itertools.product(['WC'], range(12), bases) + ] + + motif_wildcarded = [] + with open(os.path.join('/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v5.1/hg38_NOOP', "motifs_wildcard.txt"), "r") as file: + for line in file: + seq, operations = line.strip().split(maxsplit=1) # Split only on the first space + motif_wildcarded.append(operations.split()[0]) # Store in dictionary + + tokenizer.add_tokens(token_wc + motif_wildcarded) + return tokenizer + +def load_token_v4(tokenizer_kwargs): + config_class, model_class, tokenizer_class = MODEL_CLASSES['motifBert'] + tokenizer = MotifTokenizer(**tokenizer_kwargs) + + bases = ['A', 'T', 'C', 'G'] + token_del = [ + f"{operator}_POS_{i}_{char}" + for operator, i, char in itertools.product(['DEL'], range(12), bases) + ] + token_rep = [ + f"{operator}_POS_{i}_{char1}_{char2}" + for operator, i, char1, char2 in itertools.product(['SUB'], range(12), bases, bases) + if char1 != char2 + ] + + token_wc = [ + f"{operator}_POS_{i}_*_{char}" + for operator, i, char in itertools.product(['WC'], range(12), bases) + ] + + token_ins = [ + f"{operator}_POS_{i}_{char}" + for operator, i, char in itertools.product(['INS'], range(13), bases) + ] + + motif_wildcarded = [] + with open(os.path.join('/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v4/hg38', "motifs_wildcard.txt"), "r") as file: + for line in file: + seq, operations = 
line.strip().split(maxsplit=1) # Split only on the first space + motif_wildcarded.append(operations.split()[0]) # Store in dictionary + + tokenizer.add_tokens(token_del + token_rep + token_wc + token_ins + motif_wildcarded) + return tokenizer + +def train(): + + parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + wandb.init( + project=training_args.project_name, + ) + + tokenizer_kwargs = { + "cache_dir": training_args.cache_dir, + "model_max_length": training_args.model_max_length, + "padding_side": "right", + "use_fast": True, + "trust_remote_code": model_args.trust_remote_code # ้™ค้žๅฟ…่ฆๅฆๅˆ™ๅปบ่ฎฎไฟๆŒFalse + } + + if training_args.vocab_file is not None: + if not os.path.exists(training_args.vocab_file): + raise ValueError(f"Vocab file not found at: {training_args.vocab_file}") + tokenizer_kwargs["vocab_file"] = training_args.vocab_file + + if data_args.customized_tokenizer == 'token_v4': + tokenizer = load_token_v4(tokenizer_kwargs) + + elif data_args.customized_tokenizer == 'token_v5_1': + tokenizer = load_token_v5_1(tokenizer_kwargs) + + else: + tokenizer = transformers.PreTrainedTokenizerFast( + tokenizer_file=model_args.tokenizer_path, + **tokenizer_kwargs + ) + + tokenizer.pad_token = "[PAD]" + tokenizer.unk_token = "[UNK]" + tokenizer.cls_token = "[CLS]" + tokenizer.sep_token = "[SEP]" + tokenizer.mask_token = "[MASK]" + # Guard for Unigram tokenizers missing unk_id. 
+ try: + backend = getattr(tokenizer, "_tokenizer", None) + if backend is not None and backend.model.__class__.__name__ == "Unigram": + tok_path = model_args.tokenizer_path + if tok_path and os.path.exists(tok_path): + with open(tok_path, "r") as handle: + tok_json = json.load(handle) + model_cfg = tok_json.get("model", {}) + if model_cfg.get("type") == "Unigram" and model_cfg.get("unk_id") is None: + vocab = [tuple(item) for item in model_cfg.get("vocab", [])] + unk_id = next((i for i, item in enumerate(vocab) if item[0] == "[UNK]"), None) + if unk_id is not None: + backend.model = TokenizersUnigram(vocab, unk_id=unk_id) + except Exception as exc: + logging.warning(f"Unigram unk_id guard skipped: {exc}") + + if "InstaDeepAI" in model_args.model_name_or_path: + tokenizer.eos_token = tokenizer.pad_token + + # define datasets and data collator + train_dataset = SupervisedDataset(tokenizer=tokenizer, + data_path=os.path.join(data_args.data_path, "train.csv"), + kmer=data_args.kmer, + customized_tokenizer=data_args.customized_tokenizer) + val_dataset = SupervisedDataset(tokenizer=tokenizer, + data_path=os.path.join(data_args.data_path, "dev.csv"), + kmer=data_args.kmer, + customized_tokenizer=data_args.customized_tokenizer) + test_dataset = SupervisedDataset(tokenizer=tokenizer, + data_path=os.path.join(data_args.data_path, "test.csv"), + kmer=data_args.kmer, + customized_tokenizer=data_args.customized_tokenizer) + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + + + config = transformers.AutoConfig.from_pretrained( + model_args.model_name_or_path, + num_labels = train_dataset.num_labels, + trust_remote_code=model_args.trust_remote_code + ) + + model = transformers.AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + config=config, # pass the adjusted config + trust_remote_code=model_args.trust_remote_code + ).to("cuda") + + # configure LoRA + if model_args.use_lora: + 
lora_config = LoraConfig( + r=model_args.lora_r, + lora_alpha=model_args.lora_alpha, + target_modules=list(model_args.lora_target_modules.split(",")), + lora_dropout=model_args.lora_dropout, + bias="none", + task_type="SEQ_CLS", + inference_mode=False, + ) + model = get_peft_model(model, lora_config) + model.print_trainable_parameters() + + # define trainer + trainer = transformers.Trainer(model=model, + tokenizer=tokenizer, + args=training_args, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + compute_metrics=compute_metrics, + train_dataset=train_dataset, + eval_dataset=val_dataset, + data_collator=data_collator) + trainer.train() + + if training_args.save_model: + trainer.save_state() + safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) + + # get the evaluation results from trainer + if training_args.eval_and_save_results: + results_path = os.path.join(training_args.output_dir, "results", training_args.run_name) + results = trainer.evaluate(eval_dataset=test_dataset) + os.makedirs(results_path, exist_ok=True) + with open(os.path.join(results_path, "eval_results.json"), "w") as f: + json.dump(results, f) + + + + +if __name__ == "__main__": + + train()