"""Gradio app that drafts a GenBank construct for biosynthesizing a molecule.

Pipeline implemented by :func:`biosynthesize`:

1. Ask an OpenAI LLM for a "Gene | Function" table for the molecule.
2. Ask a LangChain pandas agent to match each function against the BKMS
   reaction table and take the EC number of the first matching row.
3. Search NCBI's protein database for that EC number and download the first
   hit as FASTA.
4. Back-translate each protein, prefix it with a random promoter and RBS,
   and write everything out as one annotated GenBank record.
"""

import os
import random
from io import StringIO

import gradio as gr
import pandas as pd
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord
from langchain.agents import AgentType
from langchain.agents import create_pandas_dataframe_agent
from langchain.agents import initialize_agent
from langchain.agents import load_tools
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.memory import ChatMessageHistory
from langchain.prompts import PromptTemplate
from langchain.tools import BaseTool, StructuredTool, Tool, tool
#from langchain.tools import PubmedQueryRun
from langchain.utilities import WikipediaAPIWrapper
from pydantic import BaseModel, Field

#set API key
# NOTE: this assignment replaces any OPENAI_API_KEY already present in the
# process environment.
os.environ["OPENAI_API_KEY"] = "" #openai key

#prep database of reactions
# NOTE(review): the URL points at a .tar.gz archive but is read as a plain
# gzip stream -- confirm the header row parses cleanly.
biochem_data_url = "https://bkms.brenda-enzymes.org/download/Reactions_BKMS.tar.gz"
biochem_df = pd.read_csv(biochem_data_url, compression='gzip', header=0, sep="\t")

# Contact address handed to NCBI Entrez before any query is made.
_ENTREZ_EMAIL = 'jayman1466@gmail.com'

# Lengths of the randomly generated regulatory parts placed before each CDS.
_PROMOTER_LENGTH = 50
_RBS_LENGTH = 35

# One fixed codon per amino acid, used for back-translation.
_CODON_TABLE = {
    "A": "GCA", "C": "TGT", "D": "GAT", "E": "GAA", "F": "TTT",
    "G": "GGA", "H": "CAC", "I": "ATA", "K": "AAA", "L": "CTT",
    "M": "ATG", "N": "AAT", "P": "CCC", "Q": "CAA", "R": "AGG",
    "S": "TCA", "T": "ACA", "V": "GTA", "Y": "TAT", "W": "TGG",
    "*": "TAA",
}
_STOP_CODON = "TAA"
# Placeholder codon for residues missing from _CODON_TABLE (e.g. X, B, Z, U).
_UNKNOWN_CODON = "NNN"

# Characters that must not reach the output filename: the molecule name comes
# straight from the web form, so path separators would let it escape the
# working directory.
_FILENAME_TRANSLATION = str.maketrans({'/': '_', '\\': '_', '\x00': '_'})


class SearchInput(BaseModel):
    """Argument schema for the Wikipedia search tool."""

    query: str = Field(description="should be molecule name")


def _build_wikipedia_tools():
    """Return the LangChain tool list for a Wikipedia gene search.

    TODO: not wired into the pipeline yet. The intended use is to pass these
    tools to ``initialize_agent(tools, llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION)`` and ask the agent for the
    gene names that produce the molecule.
    """
    wikipedia = WikipediaAPIWrapper()
    return [
        Tool.from_function(
            func=wikipedia.run,
            name="SearchWikipedia",
            description="Searching wikipedia for genes that produce a molecule",
            args_schema=SearchInput,
        ),
    ]


def _parse_gene_table(text):
    """Parse the LLM's pipe-delimited table into ``{'gene', 'function'}`` dicts.

    Header rows (first cell ``Gene``, any case) and separator rows (first
    cell made only of ``-``/``:``) are skipped, as are rows with an empty
    first cell. ``'function'`` is present only when the row has a second cell.
    """
    genes = []
    for raw_line in text.splitlines():
        # Drop optional leading/trailing pipes so "| a | b |" parses like "a | b".
        cells = [cell.strip() for cell in raw_line.strip().strip('|').split('|')]
        first = cells[0]
        if not first:
            continue
        if first.lower() == 'gene' or set(first) <= set('-:'):
            continue
        entry = {'gene': first}
        if len(cells) > 1 and cells[1]:
            entry['function'] = cells[1]
        genes.append(entry)
    return genes


def _assign_ec_numbers(genes):
    """Attach an EC number to each gene whose function matches a BKMS row.

    A pandas agent is asked which table element best matches the gene's
    function; the answer must equal a ``Reaction``, ``Reaction_ID_MetaCyc``
    or ``Recommended_Name`` value exactly. Genes with no function text, no
    matching row, or a missing EC number are dropped.
    """
    if not genes:
        return []

    #pull out just the needed columns from the reaction database for now to make it easier
    biochem_reactions = biochem_df[['Reaction', 'Reaction_ID_MetaCyc', 'Recommended_Name']]
    pandas_agent = create_pandas_dataframe_agent(OpenAI(temperature=0), biochem_reactions, verbose=False)

    annotated = []
    for gene in genes:
        function = gene.get('function')
        if not function:
            # Nothing to match against; avoids sending an empty query to the agent.
            continue
        answer = pandas_agent.run('which element most closely matches {}'.format(function))
        matches = biochem_df[
            (biochem_df.Reaction == answer)
            | (biochem_df.Reaction_ID_MetaCyc == answer)
            | (biochem_df.Recommended_Name == answer)
        ]
        if matches.empty:
            continue
        ec_number = matches.iloc[0]['EC_Number']
        if pd.isna(ec_number):
            continue
        annotated.append({
            'EC_Number': str(ec_number),
            'gene': gene['gene'],
            'function': function,
        })
    return annotated


def _fetch_first_protein(ec_number):
    """Return the first NCBI protein hit for ``ec_number``, or ``None``.

    The hit is returned as the record parsed by ``SeqIO.read`` from the FASTA
    download. Both Entrez handles are closed before returning.
    """
    search_handle = Entrez.esearch(db="protein", term=ec_number)
    try:
        search_result = Entrez.read(search_handle)
    finally:
        search_handle.close()

    id_list = search_result['IdList']
    if not id_list:
        return None

    #arbitrarily pick the first ID for now
    fetch_handle = Entrez.efetch(db="protein", id=id_list[0], rettype="fasta")
    try:
        return SeqIO.read(fetch_handle, "fasta")
    finally:
        fetch_handle.close()


def _random_dna(length):
    """Return a random DNA string of ``length`` bases."""
    return ''.join(random.choice('CGTA') for _ in range(length))


def _back_translate(protein):
    """Back-translate a protein string to DNA and terminate it with one stop.

    A trailing ``*`` is dropped first so the result does not end in two stop
    codons. Residues absent from the codon table become ``NNN``.
    """
    residues = protein.upper().rstrip('*')
    codons = [_CODON_TABLE.get(residue, _UNKNOWN_CODON) for residue in residues]
    codons.append(_STOP_CODON)
    return ''.join(codons)


def _build_record(molecule, proteins):
    """Assemble one GenBank record: promoter + RBS + CDS for every protein."""
    parts = []
    features = []
    position = 0

    def add_part(sequence, feature_type, qualifiers):
        # Annotate the part at the current offset, then advance past it.
        nonlocal position
        end = position + len(sequence)
        features.append(SeqFeature(FeatureLocation(start=position, end=end),
                                   type=feature_type, qualifiers=qualifiers))
        parts.append(sequence)
        position = end

    for item in proteins:
        add_part(_random_dna(_PROMOTER_LENGTH), 'promoter', {'label': 'promoter'})
        add_part(_random_dna(_RBS_LENGTH), 'RBS', {'label': 'RBS'})
        add_part(_back_translate(item['sequence']), 'CDS',
                 {'label': item['EC'] + " " + item['desired_name'],
                  'translation': item['sequence']})

    # Whitespace is collapsed to underscores so multi-word molecule names are
    # usable as the record name (LOCUS line).
    locus_name = '_'.join(molecule.split())
    record = SeqRecord(Seq(''.join(parts)), id='1243', name=locus_name,
                       description='Generated with synXNA',
                       annotations={"molecule_type": "DNA"})
    record.features.extend(features)
    return record


def biosynthesize(molecule,organism):
    """Generate GenBank text for a construct that biosynthesizes ``molecule``.

    Args:
        molecule: Name of the target molecule, as typed into the web form.
        organism: Host organism from the web form. Currently unused.

    Returns:
        The GenBank-formatted record as a string. The same text is also
        written to ``"<molecule> Biosynthesis.gb"`` in the working directory,
        with path separators in the molecule name replaced by ``_``.
    """
    #initialize LLM
    llm = OpenAI(temperature=0.5)

    #create prompt template for an openAI search
    prompt = PromptTemplate(
        input_variables=["molecule"],
        template="how do you biosynthesize {molecule}. Output a table of necessary genes with two columns: Gene and Function",
    )
    #and initiate its associated chain
    chain = LLMChain(llm=llm, prompt=prompt)

    #parse this output into a list of genes
    genes = _parse_gene_table(chain.run(molecule))

    #figure out the EC number of each gene
    annotated = _assign_ec_numbers(genes)

    #find protein sequences for these predictions
    Entrez.email = _ENTREZ_EMAIL
    proteins = []
    for gene in annotated:
        fasta = _fetch_first_protein(gene['EC_Number'])
        if fasta is None:
            # No NCBI record for this EC number; leave the gene out.
            continue
        proteins.append({
            'sequence': str(fasta.seq),
            'actual_name': fasta.name,
            'ID': fasta.id,
            'EC': gene['EC_Number'],
            'desired_name': gene['gene'],
        })

    record = _build_record(molecule, proteins)

    # Render once in memory, then persist that exact text.
    buffer = StringIO()
    SeqIO.write(record, buffer, 'genbank')
    genbank_text = buffer.getvalue()

    #write genbank file
    output_path = '{} Biosynthesis.gb'.format(molecule.translate(_FILENAME_TRANSLATION))
    with open(output_path, 'w') as output_file:
        output_file.write(genbank_text)

    return genbank_text


#create the interface
demo = gr.Interface(
    biosynthesize,
    inputs=[
        gr.Textbox(lines=1, placeholder="Biosynthesize this molecule"),
        gr.Textbox(lines=1, placeholder="In this bacteria"),
    ],
    outputs="text",
    title="What do you want to make",
)
demo.launch()