"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import argparse
import os
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import yaml
# import esm
import minigpt4.tasks as tasks
from minigpt4.esm.esm_config import Config
from minigpt4.common.dist_utils import get_rank, init_distributed_mode
from minigpt4.common.logger import setup_logger
from minigpt4.common.optims import (
    LinearWarmupCosineLRScheduler,
    LinearWarmupStepLRScheduler,
)
from minigpt4.common.registry import registry
from minigpt4.common.utils import now
# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.datasets.pdb_dataset import ESMDataset
from minigpt4.datasets.qa_dataset import QADataset
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *


def parse_args():
    parser = argparse.ArgumentParser(description="Training")
    parser.add_argument(
        "--cfg-path",
        required=False,
        default="configs/train_modality_alignment.yaml",
        help="path to configuration file.",
    )
    parser.add_argument(
        "--options",
        nargs="+",
        help="override some settings in the used config; key-value pairs "
        "in xxx=yyy format will be merged into the config file (deprecated; "
        "use --cfg-options instead).",
    )
    args = parser.parse_args()
    # if 'LOCAL_RANK' not in os.environ:
    #     os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args
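
# Example invocation (hypothetical values; --options takes xxx=yyy pairs):
#   python protein_gpt.py --cfg-path configs/train_modality_alignment.yaml \
#       --options run.seed=42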


def setup_seeds(config):
    # Offset the base seed by the process rank so each distributed worker
    # draws a different random stream while the run stays reproducible.
    seed = config.run_cfg.seed + get_rank()
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Disabling benchmark mode and forcing deterministic kernels trades some
    # speed for run-to-run reproducibility of cuDNN operations.
    cudnn.benchmark = False
    cudnn.deterministic = True


def get_runner_class(cfg):
    """
    Get the runner class from the config. Defaults to the epoch-based runner.
    """
    runner_cls = registry.get_runner_class(cfg.run_cfg.get("runner", "runner_base"))
    return runner_cls
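
# "runner_base" resolves through the registry populated by the star-imports
# above; the minigpt4 modules register their classes at import time.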


def is_stage_1_training(cfg):
    # Stage 1 performs modality alignment on the ESM dataset; any other stage
    # runs QA fine-tuning (see the branches in main() below).
    return cfg.to_dict()["run"]["stage"] == 1
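
# The stage flag lives in the YAML config, e.g. (hypothetical sketch):
#   run:
#     stage: 1
#     seed: 42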


def main():
    # allow auto-dl to complete on the main process without timeout when using the NCCL backend.
    # os.environ["NCCL_BLOCKING_WAIT"] = "1"

    # set before init_distributed_mode() to ensure the same job_id is shared across all ranks.
    job_id = now()

    cfg = Config(parse_args())

    init_distributed_mode(cfg.run_cfg)

    setup_seeds(cfg)

    # set after init_distributed_mode() to only log on the master rank.
    setup_logger()

    cfg.pretty_print()

    task = tasks.setup_task(cfg)

    # NOTE: dataset locations below are hard-coded for the original training machine.
    if is_stage_1_training(cfg):
        # Stage 1: modality alignment against abstract annotations.
        datasets_raw = ESMDataset(
            pdb_root="/home/ubuntu/pt/",
            seq_root="/home/ubuntu/seq/",
            ann_paths="/home/ubuntu/proteinchat/data/esm_subset/abstract.json",
            dataset_description="/home/ubuntu/dataset.json",
            chain="A",
        )
    else:
        # Later stages: QA fine-tuning on the merged summary annotations.
        datasets_raw = QADataset(
            pdb_root="/home/ubuntu/pt/",
            seq_root="/home/ubuntu/seq/",
            # ann_paths="/home/ubuntu/proteinchat/data/esm_subset/qa_all.json",
            ann_paths="/home/ubuntu/proteinchat/data/esm_subset/GPT_merged_summary.json",
            # dataset_description="/home/ubuntu/dataset.json",
            chain="A",
        )

    # Wrap the dataset in the {name: {split: dataset}} layout the runner consumes.
    datasets = {"esm": {"train": datasets_raw}}

    model = task.build_model(cfg)

    runner = get_runner_class(cfg)(
        cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets
    )
    runner.train()


if __name__ == "__main__":
    main()
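
# This script calls init_distributed_mode(), so a multi-GPU run can be
# launched with a distributed launcher (hypothetical example):
#   torchrun --nproc_per_node=4 protein_gpt.py --cfg-path configs/train_modality_alignment.yaml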