Spaces:

AI-BIO
/

ProteinGPT-Llama3

Sleeping

App Files Files Community

ProteinGPT-Llama3 / eval /eval_esm.py

EdwardoSunny

finished

85ab89d 3 months ago

raw

history blame contribute delete

3.95 kB

	import argparse
	import os
	import random
	import sys
	import time
	import tqdm
	sys.path.insert(0, "..")

	import numpy as np
	import torch
	import torch.backends.cudnn as cudnn

	from minigpt4.common.config import Config
	from minigpt4.common.dist_utils import get_rank
	from minigpt4.common.registry import registry
	from minigpt4.conversation.conversation_esm import Chat, CONV_VISION

	# imports modules for registration
	from minigpt4.datasets.builders import *
	from minigpt4.models import *
	from minigpt4.processors import *
	from minigpt4.runners import *
	from minigpt4.tasks import *
	import sys

	import esm
	import json

	# Dataset split specification; this script reads its "test" entry.
	DATASET_SPEC = "/home/ubuntu/proteinchat/dataset.json"

	# QA annotations; the evaluation loop indexes this mapping by protein ID.
	ANN_PATH = "/home/ubuntu/proteinchat/data/qa_all.json"
	# Directories holding the precomputed "<protein id>.pt" tensors.
	PDB_PATH = "/home/ubuntu/pt"
	SEQ_PATH = "/home/ubuntu/seq"

	# Directory that receives the generated answers as a JSON file.
	OUTPUT_SAVE_PATH = "/home/ubuntu/proteinchat/eval/results/outputs"

	# Context managers close each file handle as soon as the JSON is parsed,
	# instead of leaking the handle by rebinding its name to the parsed data.
	with open(ANN_PATH, "r", encoding="utf-8") as ann_file:
	    annotation = json.load(ann_file)

	with open(DATASET_SPEC, "r", encoding="utf-8") as spec_file:
	    dataset = json.load(spec_file)

	# Protein IDs of the test split.
	all_prots = dataset["test"]

	def parse_args(argv=None):
	    """Parse the command-line options of the evaluation script.

	    Args:
	        argv: Optional list of argument strings. When ``None`` (the default)
	            argparse reads ``sys.argv[1:]``, so calling ``parse_args()``
	            with no argument behaves as before.

	    Returns:
	        argparse.Namespace with the attributes ``cfg_path``, ``gpu_id``,
	        ``model`` and ``options``.
	    """
	    parser = argparse.ArgumentParser(description="Demo")
	    parser.add_argument(
	        "--cfg-path",
	        required=True,
	        help="path to configuration file.",
	    )
	    parser.add_argument(
	        "--gpu-id",
	        type=int,
	        default=0,
	        help="specify the gpu to load the model.",
	    )
	    # The value is used as the prefix of the saved output file name.
	    parser.add_argument(
	        "--model",
	        type=str,
	        required=True,
	        help="model name used as the prefix of the saved evaluation output file.",
	    )
	    parser.add_argument(
	        "--options",
	        nargs="+",
	        help="override some settings in the used config, the key-value pair "
	        "in xxx=yyy format will be merged into config file (deprecate), "
	        "change to --cfg-options instead.",
	    )
	    return parser.parse_args(argv)


	def setup_seeds(config):
	    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

	    The seed is ``config.run_cfg.seed`` plus the value returned by
	    ``get_rank()``.
	    """
	    seed = config.run_cfg.seed + get_rank()

	    # Apply the same seed to every random number generator in use.
	    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
	        seed_rng(seed)

	    # Disable cuDNN benchmark mode and enable its deterministic mode.
	    cudnn.benchmark, cudnn.deterministic = False, True

	print('Initializing Chat')
	args = parse_args()
	cfg = Config(args)

	# Look up the model class registered under the configured architecture,
	# build it from the config and move it to the selected GPU.
	model_config = cfg.model_cfg
	model_config.device_8bit = args.gpu_id
	model_cls = registry.get_model_class(model_config.arch)
	model = model_cls.from_config(model_config)
	model = model.to('cuda:{}'.format(args.gpu_id))

	# The processor is built from the cc_sbu_align "train" processor config.
	vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
	vis_processor_cls = registry.get_processor_class(vis_processor_cfg.name)
	vis_processor = vis_processor_cls.from_config(vis_processor_cfg)

	# Chat wrapper that the evaluation loop uses to upload proteins and ask
	# questions.
	chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))
	print('Initialization Finished')

	# raw_output maps a protein ID to its list of question/answer records.
	# score_output is initialised here but not filled in this section.
	raw_output = dict()
	score_output = dict()

	# Half-open index window [START_SAMPLES, END_SAMPLES) into the test split.
	# NOTE(review): an earlier setting was END_SAMPLES = 8806 - presumably the
	# full split size; confirm before restoring it.
	START_SAMPLES, END_SAMPLES = 0, 160

	all_prots = all_prots[slice(START_SAMPLES, END_SAMPLES)]

	# The target device does not change between proteins, so format it once.
	eval_device = 'cuda:{}'.format(args.gpu_id)

	for prot in tqdm.tqdm(all_prots):
	    curr_prot_ann = annotation[prot]
	    pdb_path = os.path.join(PDB_PATH, f"{prot}.pt")
	    seq_path = os.path.join(SEQ_PATH, f"{prot}.pt")

	    # NOTE(review): torch.load unpickles the file contents; only point
	    # PDB_PATH / SEQ_PATH at trusted, locally generated tensors.
	    seq_embedding = torch.load(seq_path, map_location=torch.device('cpu'))

	    # Proteins whose sequence embedding is longer than 384 along dim 1 are
	    # skipped. The check runs on the CPU copy, before any GPU transfer, so
	    # skipped proteins cost no device memory or copy time.
	    if seq_embedding.shape[1] > 384:
	        continue
	    sample_seq = seq_embedding.to(eval_device)

	    raw_output[prot] = []
	    pdb_embedding = torch.load(pdb_path, map_location=torch.device('cpu'))
	    sample_pdb = pdb_embedding.to(eval_device)

	    for ann in curr_prot_ann:
	        question = ann["Q"]

	        # Every question starts from a fresh copy of the conversation
	        # template and an empty embedding list.
	        chat_state = CONV_VISION.copy()
	        img_list = []
	        # Presumably fills img_list in place (its return value was never
	        # used) - confirm against Chat.upload_protein.
	        chat.upload_protein(sample_pdb, sample_seq, chat_state, img_list)
	        img_list = [mat.half() for mat in img_list]

	        chat.ask(question, chat_state)
	        # Only the first element of the value returned by answer() is kept.
	        ans = chat.answer(
	            conv=chat_state,
	            img_list=img_list,
	            num_beams=1,
	            temperature=0.7,
	            max_new_tokens=384,
	            max_length=2048,
	        )[0]
	        raw_output[prot].append({"Q": question, "A": ans})

	# Create the output directory on demand: without it, a missing folder makes
	# open() fail after the whole evaluation has run and the answers are lost.
	os.makedirs(OUTPUT_SAVE_PATH, exist_ok=True)

	# The file is named after the --model command-line value.
	output_file = os.path.join(OUTPUT_SAVE_PATH, f"{args.model}_eval_output.json")
	with open(output_file, 'w') as fp:
	    json.dump(raw_output, fp, indent=4)