```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch  # needed at runtime for the "pt" tensors the tokenizer returns

# Load ProtGPT2, a GPT-2-style language model trained on protein sequences.
tokenizer = AutoTokenizer.from_pretrained("nferruz/ProtGPT2")
model = AutoModelForCausalLM.from_pretrained("nferruz/ProtGPT2")

# GPT-2-style checkpoints ship without a pad token; reuse EOS so that
# padded, batched generation works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

def generate_binders(fusion_context, strategy='low_shot', num_candidates=10):
    # NOTE: `strategy` is accepted for API compatibility but does not yet
    # alter the sampling parameters below.
    #
    # Derive a short, deterministic seed from the first ten components of
    # the fusion embedding. Indexing into the canonical 20-letter alphabet
    # (instead of chr() arithmetic, which yields non-residue letters such
    # as B, J, and O) keeps every seed character a valid amino acid.
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    seed_sequence = fusion_context['embedding_vector'][:10]
    seed = ''.join(amino_acids[abs(int(x * 10)) % 20] for x in seed_sequence)

    inputs = tokenizer(seed, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Sample num_candidates continuations of the seed; top_k=950 matches
    # the sampling setting recommended on the ProtGPT2 model card.
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        do_sample=True,
        top_k=950,
        top_p=0.96,
        temperature=1.0,
        max_length=200,
        num_return_sequences=num_candidates,
        pad_token_id=tokenizer.pad_token_id
    )

    # ProtGPT2 emits FASTA-style text (a newline every 60 residues), so
    # strip everything that is not a canonical amino acid and keep only
    # candidates long enough to plausibly fold into a binder.
    binders = []
    for output in outputs:
        sequence = tokenizer.decode(output, skip_special_tokens=True)
        sequence = ''.join(aa for aa in sequence if aa in "ACDEFGHIKLMNPQRSTVWY")
        if len(sequence) > 30:
            binders.append({"sequence": sequence})

    return {"generated_binders": binders}
```
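
As a minimal usage sketch: the shape of `fusion_context` is assumed from the function signature above (any numeric sequence of at least ten values under the `embedding_vector` key), and the specific values here are illustrative only.

```python
# Hypothetical fusion_context; only the 'embedding_vector' key is read,
# and only its first ten values feed the seed construction.
fusion_context = {
    "embedding_vector": [0.12, -0.43, 0.87, 0.05, -0.66,
                         0.31, 0.74, -0.09, 0.52, -0.28],
}

result = generate_binders(fusion_context, num_candidates=5)
for binder in result["generated_binders"]:
    print(len(binder["sequence"]), binder["sequence"][:60])
```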