Spaces:

maykcaldas
/

MAPI_LLM

Build error

App Files Files Community

maykcaldas committed on Mar 30, 2023

Commit

f274d93

•

1 Parent(s): 5216067

First commit

Browse files

Files changed (4) hide show

agent.py +68 -0
app.py +52 -0
mapi_tools.py +215 -0
utils.py +96 -0

agent.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from mapi_tools import MAPI_class_tools, MAPI_reg_tools
+from utils import common_tools
+from langchain import OpenAI
+from gpt_index import GPTListIndex, GPTIndexMemory
+from langchain import agents
+from langchain.agents import initialize_agent
+# Classification helpers, one per boolean Materials Project summary field.
+# Arguments: API field, human-readable name, positive label, negative label.
+stability = MAPI_class_tools("is_stable", "stable", "Stable", "Unstable")
+magnetism = MAPI_class_tools("is_magnetic", "magnetic", "Magnetic", "Not magnetic")
+metal = MAPI_class_tools("is_metal", "metallic", "Metal", "Not metal")
+gap_direct = MAPI_class_tools("is_gap_direct", "gap direct", "Gap direct", "Gap indirect")
+# Regression helpers, one per numeric field. Arguments: API field, human-readable name.
+# The human-readable name is interpolated verbatim into tool names and prompts,
+# so it must name the property itself (no stray "gap" suffix outside band_gap).
+band_gap = MAPI_reg_tools("band_gap", "band gap")
+energy_per_atom = MAPI_reg_tools("energy_per_atom", "energy per atom")
+formation_energy_per_atom = MAPI_reg_tools("formation_energy_per_atom", "formation energy per atom")
+volume = MAPI_reg_tools("volume", "volume")
+density = MAPI_reg_tools("density", "density")
+atomic_density = MAPI_reg_tools("density_atomic", "atomic density")
+# NOTE(review): the e_* field names (in particular "e_ion") and their physical
+# meaning should be checked against the Materials Project summary schema - TODO confirm.
+electronic_energy = MAPI_reg_tools("e_electronic", "electronic energy")
+ionic_energy = MAPI_reg_tools("e_ion", "ionic energy")
+total_energy = MAPI_reg_tools("e_total", "total energy")
+memory = GPTIndexMemory(index=GPTListIndex([]), memory_key="chat_history", query_kwargs={"response_mode": "compact"})
+llm=OpenAI(temperature=0.7)
+def _unique_by_name(candidates):
+    """Return the tools from `candidates` in order, keeping only the first tool per name."""
+    kept = []
+    seen_names = set()
+    for candidate in candidates:
+        if candidate.name not in seen_names:
+            seen_names.add(candidate.name)
+            kept.append(candidate)
+    return kept
+# Every MAPI helper returns the same generically named tools (atom extraction and
+# LLM prediction) next to its property-specific ones, so the concatenation holds
+# many tools sharing one name; only the first of each name is handed to the agent.
+tools = _unique_by_name(
+          stability.get_tools() +
+          magnetism.get_tools() +
+          gap_direct.get_tools() +
+          metal.get_tools() +
+          band_gap.get_tools() +
+          volume.get_tools() +
+          density.get_tools() +
+          atomic_density.get_tools() +
+          formation_energy_per_atom.get_tools() +
+          energy_per_atom.get_tools() +
+          electronic_energy.get_tools() +
+          ionic_energy.get_tools() +
+          total_energy.get_tools() +
+          agents.load_tools(["llm-math", "python_repl"], llm=llm) +
+          common_tools
+         )
+# Module-level agent used by app.py through `agent.agent_chain`.
+agent_chain = initialize_agent(tools, llm, agent="zero-shot-react-description", verbose=True, memory=memory)

app.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import gradio as gr
+import numpy as np
+import agent
+import os
+# Custom CSS passed to gr.Blocks(css=...): sets the font family of the Gradio container.
+css_style = """
+.gradio-container {
+    font-family: "IBM Plex Mono";
+}
+"""
+def agent_run(q, openai_api_key, mapi_api_key):
+    """Run the agent on question `q` and return its answer as text.
+
+    The two API keys are exported as the OPENAI_API_KEY and MAPI_API_KEY
+    environment variables before the agent runs. These are process-wide
+    settings, so each call overwrites the keys of the previous one.
+
+    Returns the agent output, or a generic error message if the run fails.
+    """
+    os.environ["OPENAI_API_KEY"]=openai_api_key
+    os.environ["MAPI_API_KEY"]=mapi_api_key
+    try:
+        return agent.agent_chain.run(input=q)
+    except Exception:
+        # Keep the UI best-effort, but do not swallow KeyboardInterrupt/SystemExit
+        # and leave a traceback in the logs so failures can be diagnosed.
+        import traceback
+        traceback.print_exc()
+        return "Something went wrong, please try again"
+# Gradio UI definition. Components are declared in the order they appear below.
+with gr.Blocks(css=css_style) as demo:
+    # Introductory text: what the demo is and which API keys it needs.
+    gr.Markdown(f'''
+    # A LLM application developed during the LLM March *MADNESS* Hackathon
+    - Developed by: Mayk Caldas ([@maykcaldas](https://github.com/maykcaldas)) and Sam Cox ([@SamCox822](https://github.com/SamCox822))
+    ## What is this?
+    - This is a demo of a LLM agent that can answer questions about materials science using the [LangChain🦜️🔗](https://github.com/hwchase17/langchain/) and the [Materials Project API](https://materialsproject.org/).
+    - Its behave is based on Large Language Models (LLM) and aim to be a tool to help scientists with quick predictions of a nunerous of properties of materials.
+    It is a work in progress, so please be patient with it.
+    ### Some keys are needed in order to use it:
+    1. An openAI API key ( [Check it here](https://platform.openai.com/account/api-keys) )
+    2. A material project's API key ( [Check it here](https://materialsproject.org/api#api-key) )
+    ''')
+    # Accordion (created with open=False) listing the supported properties.
+    with gr.Accordion("List of properties we developed tools for", open=False):
+        gr.Markdown(f"""
+        Classification tasks: Stability, magnetism, gap_direct, metal,
+        regression tasks: band_gap, volume, density, atomic_density, formation energy per atom, energy per atom, electronic energy, ionic energy, total energy
+        """)
+    # Password-type textboxes for the two API keys; their values are passed to agent_run.
+    openai_api_key = gr.Textbox(
+        label="OpenAI API Key", placeholder="sk-...", type="password")
+    mapi_api_key = gr.Textbox(
+        label="Material Project API Key", placeholder="...", type="password")
+    # Single tab holding the question box, the answer box and the submit button.
+    with gr.Tab("MAPI Query"):
+        text_input = gr.Textbox(label="", placeholder="Enter question here...")
+        text_output = gr.Textbox()
+        text_button = gr.Button("Query!")
+    # Clicking the button calls agent_run(question, openai key, mapi key) and
+    # writes the returned text into text_output.
+    text_button.click(agent_run, inputs=[text_input, openai_api_key, mapi_api_key], outputs=text_output)
+# Launched at module level, i.e. as soon as this file is executed.
+demo.launch()

mapi_tools.py ADDED Viewed

	@@ -0,0 +1,215 @@

+from mp_api.client import MPRester
+from emmet.core.summary import HasProps
+import openai
+import langchain
+from langchain import OpenAI
+from langchain import agents
+from langchain.agents import initialize_agent
+from langchain.agents import Tool, tool
+from langchain import LLMMathChain, SerpAPIWrapper
+from gpt_index import GPTListIndex, GPTIndexMemory
+from langchain import SerpAPIWrapper
+from langchain.prompts.few_shot import FewShotPromptTemplate
+from langchain.prompts.prompt import PromptTemplate
+from langchain.vectorstores import FAISS, Chroma
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.prompts.example_selector import (MaxMarginalRelevanceExampleSelector,
+                                                SemanticSimilarityExampleSelector)
+import requests
+from rdkit import Chem
+import pandas as pd
+import os
+class MAPITools:
+  '''Base class that exposes one Materials Project summary property as LangChain tools.
+
+  Subclasses must set ``self.prop`` (the summary field requested from the API) and
+  ``self.prop_name`` (the human-readable name interpolated into tool names and
+  prompts), and implement ``check_prop_by_formula`` and ``create_context_prompt``.
+  '''
+  def __init__(self):
+    self.model = 'text-ada-001' #maybe change to gpt-4 when ready
+    # Number of examples the example selector picks for the few-shot prompt.
+    self.k=10
+  def get_material_atoms(self, formula):
+    '''Receives a material formula and returns the atoms symbols present in it separated by comma.
+
+    Each symbol is listed once, in order of first appearance, so a formula that
+    repeats an element (e.g. "CH3COOH") does not yield repeated entries.
+    '''
+    import re
+    # A symbol is an upper-case letter followed by any lower-case letters;
+    # stoichiometric counts are irrelevant here and are skipped.
+    symbols = re.findall(r"[A-Z][a-z]*", formula)
+    # dict.fromkeys drops duplicates while preserving order.
+    return ",".join(dict.fromkeys(symbols))
+  def check_prop_by_formula(self, formula):
+    '''Looks the property up for `formula`; must be implemented by subclasses.'''
+    raise NotImplementedError('Should be implemented in children classes')
+  def search_similars_by_atom(self, atoms):
+    '''This function receives a string with the atoms separated by comma as input and returns a list of similar materials'''
+    atoms = atoms.replace(" ", "")
+    with MPRester(os.getenv("MAPI_API_KEY")) as mpr:
+      docs = mpr.summary.search(elements=atoms.split(','), fields=["formula_pretty", self.prop])
+    return docs
+  def create_context_prompt(self, formula):
+    '''Builds the few-shot prompt for `formula`; must be implemented by subclasses.'''
+    raise NotImplementedError('Should be implemented in children classes')
+  def LLM_predict(self, prompt):
+    ''' This function receives a prompt generate with context by the create_context_prompt tool and request a completion to a language model. Then returns the completion'''
+    llm = OpenAI(
+          model_name=self.model,
+          temperature=0.7,
+          n=1,
+          best_of=5,
+          top_p=1.0,
+          stop=["\n\n", "###", "#", "##"],
+          # model_kwargs=kwargs,
+      )
+    return llm.generate([prompt]).generations[0][0].text
+  def get_tools(self):
+    '''Returns the LangChain ``Tool`` objects wrapping this helper's methods.
+
+    Tool names and descriptions are worded differently for classification
+    helpers ("is the material <prop_name>?") and for regression helpers
+    ("what is the <prop_name>?"). ``search_similars_by_atom`` is not exposed
+    as a tool; it is only used by ``create_context_prompt``.
+    '''
+    if isinstance(self, MAPI_class_tools):
+      check_name = f"Checks if material is {self.prop_name} by formula"
+      check_description = (
+          f"This functions searches in the material project's API for the formula and returns if it is {self.prop_name} or not."
+        )
+      context_description = (
+          f"This function received a material formula as input and create a prompt to be inputed in the LLM_predict tool to predict if the material is {self.prop_name}."
+        )
+    else:
+      # A numeric property is not something a material "is", so the lookup
+      # tool is phrased as a retrieval instead of a yes/no check.
+      check_name = f"Get {self.prop_name} of material by formula"
+      check_description = (
+          f"This functions searches in the material project's API for the formula and returns its {self.prop_name}."
+        )
+      context_description = (
+          f"This function received a material formula as input and create a prompt to be inputed in the LLM_predict tool to predict the {self.prop_name} of a material."
+        )
+    return [
+        Tool(
+            name = "Get atoms in material",
+            func = self.get_material_atoms,
+            description = (
+              "Receives a material formula and returns the atoms symbols present in it separated by comma."
+              )
+        ),
+        Tool(
+            name = check_name,
+            func = self.check_prop_by_formula,
+            description = check_description
+        ),
+        Tool(
+            name = f"Create {self.prop_name} context to LLM search",
+            func = self.create_context_prompt,
+            description = context_description
+        ),
+        Tool(name = "LLM predictiom",
+            func = self.LLM_predict,
+            description = (
+                "This function receives a prompt generate with context by the create_context_prompt tool and request a completion to a language model. Then returns the completion"
+              )
+        )
+    ]
+class MAPI_class_tools(MAPITools):
+  '''Tools for a boolean Materials Project property (classification task).
+
+  prop:      summary field requested from the API.
+  prop_name: human-readable name used in tool names and prompts.
+  p_label:   text returned when the field is truthy.
+  n_label:   text returned when the field is falsy.
+  '''
+  def __init__(self, prop, prop_name, p_label, n_label):
+    super().__init__()
+    self.prop = prop
+    self.prop_name = prop_name
+    self.p_label = p_label
+    self.n_label = n_label
+  def check_prop_by_formula(self, formula):
+    '''Searches the Materials Project API for `formula` and returns the positive
+    or negative label of the property, or a "could not find" message when no
+    returned entry has exactly that pretty formula.'''
+    with MPRester(os.getenv("MAPI_API_KEY")) as mpr:
+      docs = mpr.summary.search(formula=formula, fields=["formula_pretty", self.prop])
+    # Use the first entry whose formula matches exactly, not only the first hit.
+    for doc in docs or []:
+      if doc.formula_pretty == formula:
+        return self.p_label if doc.dict()[self.prop] else self.n_label
+    return f"Could not find any material while searching {formula}"
+  def create_context_prompt(self, formula):
+    '''Receives a material formula and creates a few-shot prompt, to be passed to
+    the LLM_predict tool, asking whether the material has the property.
+
+    The examples are materials sharing the formula's elements, labelled with
+    the positive/negative label of the property.'''
+    elements = self.get_material_atoms(formula)
+    similars = self.search_similars_by_atom(elements)
+    similars = [
+        {'formula': ex.formula_pretty,
+        'prop': self.p_label if ex.dict()[self.prop] else self.n_label
+        } for ex in similars
+    ]
+    examples = pd.DataFrame(similars).drop_duplicates().to_dict(orient="records")
+    if not examples:
+      # Without examples there is nothing to index or select from, so report
+      # it instead of letting the example selector fail.
+      return f"Could not find any similar material to build a context for {formula}"
+    example_selector = MaxMarginalRelevanceExampleSelector.from_examples(
+                    examples,
+                    OpenAIEmbeddings(),
+                    FAISS,
+                    k=self.k,
+                  )
+    prefix=(
+      f'You are a bot who can predict if a material is {self.prop_name}.\n'
+      f'Given this list of known materials and the information if they are {self.p_label} or {self.n_label}, \n'
+      f'you need to answer the question if the last material is {self.prop_name}:'
+      )
+    # Doubled braces survive the f-string as {formula}/{prop} template fields.
+    prompt_template=PromptTemplate(
+                  input_variables=["formula", "prop"],
+                  template=f"Is {{formula}} a {self.prop_name} material?@@@\n{{prop}}###",
+              )
+    suffix = f"Is {{formula}} a {self.prop_name} material?@@@\n"
+    prompt = FewShotPromptTemplate(
+              # examples=examples,
+              example_prompt=prompt_template,
+              example_selector=example_selector,
+              prefix=prefix,
+              suffix=suffix,
+              input_variables=["formula"])
+    return prompt.format(formula=formula)
+class MAPI_reg_tools(MAPITools):
+  '''Tools for a numeric Materials Project property (regression task).
+
+  prop:      summary field requested from the API.
+  prop_name: human-readable name used in tool names and prompts.
+  '''
+  # TODO: deal with units
+  def __init__(self, prop, prop_name):
+    super().__init__()
+    self.prop = prop
+    self.prop_name = prop_name
+  def check_prop_by_formula(self, formula):
+    '''Searches the Materials Project API for `formula` and returns the value of
+    the property.
+
+    Returns the first non-missing value among entries whose pretty formula is
+    exactly `formula`; a "no record" message when such entries exist but none
+    has a value; a "could not find" message when no entry matches.'''
+    with MPRester(os.getenv("MAPI_API_KEY")) as mpr:
+      docs = mpr.summary.search(formula=formula, fields=["formula_pretty", self.prop])
+    matching = [doc for doc in (docs or []) if doc.formula_pretty == formula]
+    for doc in matching:
+      value = doc.dict()[self.prop]
+      if value is not None:
+        return value
+    if matching:
+      # The material exists but the API holds no value for this property.
+      return f"There is no record of {self.prop_name} for {formula}"
+    return f"Could not find any material while searching {formula}"
+  def create_context_prompt(self, formula):
+    '''Receives a material formula and creates a few-shot prompt, to be passed to
+    the LLM_predict tool, asking for the value of the property.
+
+    The examples are materials sharing the formula's elements together with
+    their known value; materials without a value are dropped.'''
+    elements = self.get_material_atoms(formula)
+    similars = self.search_similars_by_atom(elements)
+    records = []
+    for ex in similars:
+      value = ex.dict()[self.prop]
+      # NOTE(review): ':2f' means minimum width 2 with the default six decimals;
+      # '.2f' (two decimals) may have been intended - confirm before changing.
+      records.append({
+          'formula': ex.formula_pretty,
+          'prop': f"{value:2f}" if value is not None else None
+      })
+    examples = pd.DataFrame(records).drop_duplicates().dropna().to_dict(orient="records")
+    if not examples:
+      # Without examples there is nothing to index or select from, so report
+      # it instead of letting the example selector fail.
+      return f"Could not find any similar material to build a context for {formula}"
+    example_selector = MaxMarginalRelevanceExampleSelector.from_examples(
+                    examples,
+                    OpenAIEmbeddings(),
+                    FAISS,
+                    k=self.k,
+                  )
+    prefix=(
+      f'You are a bot who can predict the {self.prop_name} of a material .\n'
+      f'Given this list of known materials and the measurement of their {self.prop_name}, \n'
+      f'you need to answer the what is the {self.prop_name} of the material:\n'
+       'The answer should be numeric and finish with ###'
+      )
+    # Doubled braces survive the f-string as {formula}/{prop} template fields.
+    prompt_template=PromptTemplate(
+                  input_variables=["formula", "prop"],
+                  template=f"What is the {self.prop_name} for {{formula}}?@@@\n{{prop}}###",
+              )
+    suffix = f"What is the {self.prop_name} for {{formula}}?@@@\n"
+    prompt = FewShotPromptTemplate(
+              # examples=examples,
+              example_prompt=prompt_template,
+              example_selector=example_selector,
+              prefix=prefix,
+              suffix=suffix,
+              input_variables=["formula"])
+    return prompt.format(formula=formula)

utils.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from langchain.agents import Tool, tool
+import requests
+from langchain import OpenAI
+from langchain import LLMMathChain, SerpAPIWrapper
+from rdkit import Chem
+# Looks a molecule name up in PubChem and returns the IsomericSMILES of the first
+# record, or an explanatory message when the lookup fails for any reason.
+# The docstring is left untouched because the @tool decorator may reuse it as
+# the tool description (NOTE(review): confirm against the langchain version in use).
+@tool
+def query2smiles(text):
+  '''This function queries the one given molecule name and returns a SMILES string from the record'''
+  try:
+    # A timeout keeps the agent from hanging forever on a stalled connection.
+    r = requests.get(
+      'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + text + '/property/IsomericSMILES/JSON',
+      timeout=30,
+    )
+    data = r.json()
+    return data['PropertyTable']['Properties'][0]['IsomericSMILES']
+  except Exception:
+    # Network errors, non-JSON replies and missing keys all end up here; the
+    # message must be returned so the agent gets text instead of None.
+    return f"Could not find the SMILES for {text}"
+# Looks a SMILES string up in PubChem and returns the IUPACName of the first
+# record, or an explanatory message when the lookup fails for any reason.
+# The docstring is left untouched because the @tool decorator may reuse it as
+# the tool description (NOTE(review): confirm against the langchain version in use).
+@tool
+def smiles2IUPAC(text):
+  '''This function queries the one given smiles name and returns a IUPAC name from the record'''
+  try:
+    # A timeout keeps the agent from hanging forever on a stalled connection.
+    r = requests.get(
+      'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/' + text + '/property/IUPACName/JSON',
+      timeout=30,
+    )
+    data = r.json()
+    return data["PropertyTable"]["Properties"][0]["IUPACName"]
+  except Exception:
+    # Best-effort tool: any failure is reported as text, but KeyboardInterrupt
+    # and SystemExit are no longer swallowed.
+    return f"Could not find the IUPAC name for {text}"
+# Looks a chemical formula up in PubChem and returns the IUPACName of the first
+# record, or an explanatory message when the lookup fails for any reason.
+# The docstring is left untouched because the @tool decorator may reuse it as
+# the tool description (NOTE(review): confirm against the langchain version in use).
+@tool
+def formula2IUPAC(text):
+  '''This function queries the one given chemical formula and returns a material name from the record.'''
+  try:
+    # A timeout keeps the agent from hanging forever on a stalled connection.
+    r = requests.get(
+      'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/formula/' + text + '/property/IUPACName/JSON',
+      timeout=30,
+    )
+    data = r.json()
+    return data["PropertyTable"]["Properties"][0]["IUPACName"]
+  except Exception:
+    # Best-effort tool: any failure is reported as text, but KeyboardInterrupt
+    # and SystemExit are no longer swallowed.
+    return f"Could not find the IUPAC name for {text}"
+# Looks a material name up in PubChem and returns the MolecularFormula of the
+# first record, or an explanatory message when the lookup fails for any reason.
+# The docstring is left untouched because the @tool decorator may reuse it as
+# the tool description (NOTE(review): confirm against the langchain version in use).
+@tool
+def name2formula(text):
+  '''This function queries the one given material name and returns a chemical formula from the record.'''
+  try:
+    # A timeout keeps the agent from hanging forever on a stalled connection.
+    r = requests.get(
+      'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + text + '/property/MolecularFormula/JSON',
+      timeout=30,
+    )
+    data = r.json()
+    return data["PropertyTable"]["Properties"][0]["MolecularFormula"]
+  except Exception:
+    # Best-effort tool: any failure is reported as text, but KeyboardInterrupt
+    # and SystemExit are no longer swallowed.
+    return f"Could not find the molecular formula for {text}"
+# Parses a SMILES string with RDKit and writes it back out, which yields RDKit's
+# canonical form; returns an explanatory message when the input cannot be parsed.
+# The docstring is left untouched because the @tool decorator may reuse it as
+# the tool description (NOTE(review): confirm against the langchain version in use).
+@tool
+def canonicalizeSMILES(smiles):
+  '''Given a smiles representation, this function returns a canonicalized version of the same smiles.
+  It's better to search for molecules in its canonicalized form'''
+  mol = Chem.MolFromSmiles(smiles)
+  if mol is None:
+    # RDKit signals an unparsable SMILES with None rather than an exception;
+    # report it as text instead of failing inside MolToSmiles.
+    return f"Could not parse the SMILES {smiles}"
+  return Chem.MolToSmiles(mol)
+# Runs a SerpAPI search for the given keywords and returns the wrapper's result.
+# The API key is read from the SERP_API_KEY environment variable.
+# The docstring is left untouched because the @tool decorator may reuse it as
+# the tool description (NOTE(review): confirm against the langchain version in use).
+@tool
+def web_search(keywords, search_engine="google"):
+  '''Useful to do a simple google search.
+      Use this tool to find general information from websites.
+      Use keywords for your search.
+  '''
+  # utils.py does not import os at module level; import it here so that
+  # os.getenv below does not raise NameError.
+  import os
+  return SerpAPIWrapper(
+    serpapi_api_key=os.getenv("SERP_API_KEY"),
+    search_engine=search_engine
+  ).run(keywords)
+# Sends a ready-made prompt to an OpenAI completion model and returns the text of
+# the first generation.
+@tool
+def LLM_predict(prompt):
+  ''' This function receives a prompt generate with context by the create_context_prompt tool and request a completion to a language model. Then returns the completion'''
+  # Completion settings, collected in one place before building the client.
+  completion_settings = dict(
+      model_name='text-ada-001',  #TODO: Maybe change to gpt-4 when ready
+      temperature=0.7,
+      n=1,
+      best_of=5,
+      top_p=1.0,
+      stop=["\n\n", "###", "#", "##"],
+      # model_kwargs=kwargs,
+  )
+  completion_llm = OpenAI(**completion_settings)
+  # One prompt is submitted, so the answer is the first generation of the first result.
+  result = completion_llm.generate([prompt])
+  first_generation = result.generations[0][0]
+  return first_generation.text
+# Generic, property-independent tools exported to agent.py, which appends them
+# to the property-specific tools. formula2IUPAC and name2formula are defined
+# above but left out of the list.
+common_tools = [
+    query2smiles,
+    smiles2IUPAC,
+    # formula2IUPAC,
+    # name2formula,
+    canonicalizeSMILES,
+    web_search,
+    LLM_predict
+]