Spaces:

jayman1466
/

synChat

Runtime error

App Files Files Community

synChat / app.py

jayman1466

Update app.py

bb74fd7 over 1 year ago

raw

history blame contribute delete

7.53 kB

	import gradio as gr
	import os
	import pandas as pd
	from langchain.llms import OpenAI
	from langchain.prompts import PromptTemplate
	from langchain.chains import LLMChain
	from pydantic import BaseModel, Field
	from langchain.tools import BaseTool, StructuredTool, Tool, tool

	from langchain.memory import ChatMessageHistory

	from langchain.agents import initialize_agent
	from langchain.agents import AgentType
	from langchain.agents import load_tools
	#from langchain.tools import PubmedQueryRun
	from langchain.utilities import WikipediaAPIWrapper
	from langchain.agents import create_pandas_dataframe_agent

	from Bio import Entrez, SeqIO
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	from Bio.SeqFeature import SeqFeature, FeatureLocation
	import random

	# Set the API key. setdefault keeps a key that is already present in the
	# environment (for example one injected as a deployment secret) instead of
	# overwriting it with the empty placeholder below.
	os.environ.setdefault("OPENAI_API_KEY", "")  # openai key
	# Prep database of reactions: downloaded once at import time and shared by
	# every call that reads biochem_df.
	# NOTE(review): the URL ends in .tar.gz but is read with compression='gzip';
	# confirm the first column header parses cleanly from the tar stream.
	biochem_data_url = "https://bkms.brenda-enzymes.org/download/Reactions_BKMS.tar.gz"
	biochem_df = pd.read_csv(biochem_data_url, compression='gzip', header=0, sep="\t")

	# One codon per amino-acid letter used for back-translation ('*' is a stop).
	_CODON_FOR_RESIDUE = {
	    "A": "GCA", "C": "TGT", "D": "GAT", "E": "GAA", "F": "TTT", "G": "GGA",
	    "H": "CAC", "I": "ATA", "K": "AAA", "L": "CTT", "M": "ATG", "N": "AAT",
	    "P": "CCC", "Q": "CAA", "R": "AGG", "S": "TCA", "T": "ACA", "V": "GTA",
	    "Y": "TAT", "W": "TGG", "*": "TAA",
	}


	def _parse_gene_table(table_text):
	    """Parse a pipe-delimited "Gene | Function" table into row dicts.

	    Header rows (first cell "gene", any case), separator rows (only dashes or
	    colons) and lines with fewer than two cells are skipped.

	    Returns:
	        A list of ``{'gene': ..., 'function': ...}`` dicts, one per data row.
	    """
	    rows = []
	    for raw_line in table_text.split('\n'):
	        # Tolerate optional leading/trailing pipes and padding around cells.
	        cells = [cell.strip() for cell in raw_line.strip().strip('|').split('|')]
	        if len(cells) < 2 or not cells[0]:
	            continue
	        is_header = cells[0].lower() == 'gene'
	        is_separator = not cells[0].strip('-: ')
	        if is_header or is_separator:
	            continue
	        rows.append({'gene': cells[0], 'function': cells[1]})
	    return rows


	def _assign_ec_numbers(gene_rows):
	    """Attach an EC number from biochem_df to each gene whose function matches.

	    A pandas dataframe agent is asked which reaction-table element most closely
	    matches each function; genes whose answer is not found verbatim in the
	    table, or whose matched row has no EC number, are dropped.
	    """
	    if not gene_rows:
	        return []

	    # pull out just the needed columns from the reaction database for the agent
	    reaction_columns = biochem_df[['Reaction', 'Reaction_ID_MetaCyc', 'Recommended_Name']]
	    pandas_agent = create_pandas_dataframe_agent(OpenAI(temperature=0), reaction_columns, verbose=False)

	    matched = []
	    for row in gene_rows:
	        answer = pandas_agent.run('which element most closely matches {}'.format(row['function']))
	        if isinstance(answer, str):
	            answer = answer.strip()
	        hits = biochem_df[
	            (biochem_df.Reaction == answer)
	            | (biochem_df.Reaction_ID_MetaCyc == answer)
	            | (biochem_df.Recommended_Name == answer)
	        ]
	        # see if anything was even found
	        if hits.shape[0] == 0:
	            continue
	        ec_number = hits.iloc[0]['EC_Number']
	        if pd.isna(ec_number):
	            continue
	        matched.append({'EC_Number': ec_number, 'gene': row['gene'], 'function': row['function']})
	    return matched


	def _fetch_protein_sequences(ec_rows):
	    """Fetch one NCBI protein sequence per EC number.

	    The first search hit is used (an arbitrary choice); EC numbers with no
	    hits are skipped instead of raising IndexError.
	    """
	    Entrez.email = 'jayman1466@gmail.com'

	    fetched = []
	    for row in ec_rows:
	        search_handle = Entrez.esearch(db="protein", term=row['EC_Number'])
	        try:
	            search_result = Entrez.read(search_handle)
	        finally:
	            search_handle.close()

	        id_list = search_result['IdList']
	        if not id_list:
	            continue
	        # arbitrarily pick the first ID for now
	        this_id = id_list[0]

	        fetch_handle = Entrez.efetch(db="protein", id=this_id, rettype="fasta")
	        try:
	            protein = SeqIO.read(fetch_handle, "fasta")
	        finally:
	            fetch_handle.close()

	        fetched.append({
	            'sequence': str(protein.seq),
	            'actual_name': protein.name,
	            'ID': protein.id,
	            'EC': row['EC_Number'],
	            'desired_name': row['gene'],
	        })
	    return fetched


	def _random_dna(length):
	    """Return a random DNA string of the given length."""
	    return ''.join(random.choice('CGTA') for _ in range(length))


	def _back_translate(protein):
	    """Back-translate a protein string to DNA and append one TAA stop codon.

	    A trailing '*' is dropped so the stop codon is not emitted twice, and
	    letters without a codon in the table (e.g. X, B, Z) become 'NNN' rather
	    than raising KeyError.
	    """
	    residues = protein.upper().rstrip('*')
	    codons = [_CODON_FOR_RESIDUE.get(residue, 'NNN') for residue in residues]
	    codons.append('TAA')
	    return ''.join(codons)


	def _assemble_record(molecule, genes):
	    """Build a SeqRecord with a promoter, RBS and CDS feature for every gene."""
	    # Biopython's GenBank writer rejects whitespace in the LOCUS name, so
	    # multi-word molecule names are joined with underscores.
	    locus_name = '_'.join(molecule.split())
	    record = SeqRecord(Seq(""),
	                       id='1243',
	                       name=locus_name,
	                       description='Generated with synXNA',
	                       annotations={"molecule_type": "DNA"})

	    pieces = []
	    cursor = 0
	    for item in genes:
	        segments = (
	            (_random_dna(50), 'promoter', {'label': 'promoter'}),
	            (_random_dna(35), 'RBS', {'label': 'RBS'}),
	            (_back_translate(item['sequence']), 'CDS',
	             {'label': '{} {}'.format(item['EC'], item['desired_name']),
	              'translation': item['sequence']}),
	        )
	        for dna, feature_type, qualifiers in segments:
	            location = FeatureLocation(start=cursor, end=cursor + len(dna))
	            record.features.append(SeqFeature(location, type=feature_type, qualifiers=qualifiers))
	            pieces.append(dna)
	            cursor += len(dna)

	    record.seq = Seq(''.join(pieces))
	    return record


	def biosynthesize(molecule, organism):
	    """Generate a GenBank construct of genes proposed to biosynthesize a molecule.

	    Steps: ask the LLM for a gene/function table, map each function to an EC
	    number via the reaction table, fetch one protein sequence per EC number
	    from NCBI, back-translate it, and lay the genes out behind random
	    promoter and RBS placeholders.

	    Args:
	        molecule: Name of the target molecule; also used in the output file name.
	        organism: Accepted from the UI but not used by the pipeline yet.

	    Returns:
	        The text of the GenBank file, which is also written to
	        '<molecule> Biosynthesis.gb' in the working directory.
	    """
	    # initialize LLM
	    llm = OpenAI(temperature=0.5)

	    # create prompt template for an openAI search and its associated chain
	    prompt = PromptTemplate(
	        input_variables=["molecule",],
	        template="how do you biosynthesize {molecule}. Output a table of necessary genes with two columns: Gene and Function",
	    )
	    chain = LLMChain(llm=llm, prompt=prompt)
	    gene_rows = _parse_gene_table(chain.run(molecule))

	    # TODO: augment this table with a Wikipedia-backed agent search (not wired up yet)

	    ec_rows = _assign_ec_numbers(gene_rows)
	    genes = _fetch_protein_sequences(ec_rows)
	    record = _assemble_record(molecule, genes)

	    # write genbank file, then hand its text back to the caller
	    # NOTE(review): molecule comes straight from the UI and is used in a file
	    # path; consider restricting it to safe characters.
	    output_path = '{} Biosynthesis.gb'.format(molecule)
	    with open(output_path, 'w') as output_file:
	        SeqIO.write(record, output_file, 'genbank')
	    with open(output_path, 'r') as written_file:
	        return written_file.read()

	# Build the web UI: two single-line text boxes feed biosynthesize, and the
	# string it returns is shown as plain text. The app is launched immediately.
	molecule_box = gr.Textbox(lines=1, placeholder="Biosynthesize this molecule")
	organism_box = gr.Textbox(lines=1, placeholder="In this bacteria")
	demo = gr.Interface(
	    fn=biosynthesize,
	    inputs=[molecule_box, organism_box],
	    outputs="text",
	    title="What do you want to make",
	)
	demo.launch()