brief
A fine-tuned large language model trained on a mixture of DNA sequences and human language
You can use chat-style prompts to conduct bioinformatics research. :)
Basic usage
# Example: classify a DNA sequence as "promoter" or "terminator" using the
# fine-tuned GPT-2 model "dnagpt/dna_promoter_sft" with an Alpaca-style
# instruction prompt.
import torch
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

# Load the fine-tuned model and the mixed DNA/English BPE tokenizer.
model = GPT2LMHeadModel.from_pretrained("dnagpt/dna_promoter_sft")
tokenizer = GPT2Tokenizer.from_pretrained("dnagpt/dna_eng_bpe")
model.eval()  # inference only — disable dropout etc.

# Instruction prompt (renamed from `input`, which shadowed the Python builtin).
prompt = """
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Determine the following dna sequence is promoter or terminator
### Input:
TCTTTCTCTTCTGTATCATTCTACTTCTATGACTGCTCCTTCTCGAGTAAAACAGAATGTGTCTCAGGATTACTTTAAAACAAGACAAAGTATAGAGTTAAAATACATTTTTGTTTAGGTAAATATCATCAAATAATCCAATTTGGAAACCAACATTCTTACTTCTTTTTCCAACAGTTGTTCCTATCATCATAAAAACAGAGAGGGTAAGCCTTGGGGAAAGCTACTTTAAAAAAATGGCCTCTAAGGATATTCAGGGTGCAAACAGTAACCTGTTCAGGCACAGATTCTTCTCCTTGAAATGAACTGAAGTTGGCAAATATTCCTCACAGCCTGTTGGAGGGTTCAGCAGTTTATTACAGAAGTATGAAATGCTTTTATTTAAAAAATGTATTTTGGTACACATGTTAGTCTTTTTTGTGATGAACTTCACAGATACATTTGACATTGGTATGCCTCATTTATTTGTTGAAATTTTTTTCTTTGGCTTCCATGAAGTTAAGATTTTTCAAGCAAGATGAAGTCCATCATCCTCTTTGTCCTTTCCCTGCTCCTTATCTTGGAGAAGCAAGCAGCTGTGATGGGACAAAAAGGTGAGTGATGGTTTCAGCTGGACTCCATCCTTCAACTGACAACCCAACACAACTGTATTCATCTCTCGTTAACATTACTAGCAGTGAGTAACATCAGAAAGTTTTTGAGTCTCCTTCCTACTCCGGACTGAATTTCTTCTGTATAATGCAAGCGATCTGGCATGATGATATACAAAGACCGATAAAATTTTGCTGGGGATTCTGAAAGTAAAAAAAATTGCCTTTGATATTATGTCCCCATGCTAAGTCCCTGGGGACTTTGACATTATCCCCCACTGAGCAGGGGTGAGGAAGCTGGCATTTACTAAAAAGTCAGCAATATGATTTGAATGCCCTACATAAGGCGACAAAATCAAAACAACACCTAGGTGGAAGTCAACAACTGCTCAATTATAAACAAGAAGGCAATAAGCTATGAAAGGGCAGTGCCTTTTGACATTTCAGCTCCACCCATAGCACACCCACTCAAGGAACATATAAATGAAGAGATCCGCTCAGTTCTCAGACCTATTTTGGACAAAAAGACCAACAACATACTAAATCCAAAGGCAGTTTTTCTATTCAACACACATATCATGTAGACATCAATGATCATGACTGGACCCGATGAGAATTGATTTTTCTCCACCCAACGCTGTAGGCTTTTGGAAATATCAGAAATTTGTTGGGAAAAGGTGGGAGGTAAGAGTTGCAAGAGAGCTTTGGAGATAATGAATGCATACATTTCTATTATCAATTACCAGGTGGATCAAAAGGCCAATTGCCAAGCGGATCTTCCCAATTTCCACATGGACAAAAGGGCCAGCATACCTAAAATTTAGTATTTAAGTTATTGGATCAGAAAGGAAACTCGCATTTAGAGTATGAAGGCATTGTCAGCCACCAATTACTTTTGTAACCTGAAGCTGAATAGCCCATCTGGAAAGGGATTATCCAGTCAATGTTCAAACACAGAAAAAAGGCTATGGGTTCATGGACTAAGTAAAGAACAAGCTTCAGCCTCTGGTGAGACCATGATAAATCAAAAGGTCATTTTCACATGATAGTTATACATCATAAAGGAGGCCAAGCTCATCATGGGACACAAAATCCTTCTCAAGATCAGGGTGTTAAATTTTTCTCATCCTTTCAAAAAATCTCTGGAGCTTACCTCACCCTCCAGATACAGCCTCACCTCTCATTCCACCATGAAATCAGACTTCTTGAGGCACAAAAAGGTAGAACACAAGGTGGATCCCAAAGCAGTTATGTTCTCCAAACTGAAGAACTAGTAGTTAACAAACAACAACGTGAGACTAAAAATTCTCATCAAAATAAAGGGCATTACCAAAATGTGGTTGACGTGAGAGAGGAACATTCAAGTAAACTACAAACTTCACTCCATCCTGCACATCAAGACAGACTCCA
### Response:
"""

# Tokenize; truncate so the prompt fits within the model's context window.
input_ids = tokenizer.encode(
    prompt,
    return_tensors="pt",
    truncation=True,
    max_length=1000,
)

# Generate the classification answer (only a few new tokens are needed).
device = model.device
with torch.no_grad():  # inference: no gradient graph needed
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids.to(device),
        max_new_tokens=3,
    )

generated_text_with_prompt = tokenizer.decode(
    generated_tokens_with_prompt[0], skip_special_tokens=True
)
# Strip the echoed prompt, keeping only the model's answer.
# NOTE(review): this slicing assumes the decoded output begins with the
# prompt text verbatim — confirm if the prompt were ever long enough to
# be truncated at max_length.
generated_text_answer = generated_text_with_prompt[len(prompt):]
print(generated_text_answer)
# Expected output: terminator
GitHub
- Downloads last month
- 7