Merge pull request #11 from almutareb/sqlite_for_sources
- app.py +5 -2
- hf_mixtral_agent.py +4 -29
- innovation_pathfinder_ai/database/db_handler.py +109 -0
- innovation_pathfinder_ai/database/schema.py +15 -0
- innovation_pathfinder_ai/structured_tools/structured_tools.py +24 -23
- innovation_pathfinder_ai/utils.py +0 -42
- innovation_pathfinder_ai/utils/logger.py +20 -0
- innovation_pathfinder_ai/utils/utils.py +171 -0
- requirements.txt +3 -1
app.py
CHANGED
@@ -3,7 +3,10 @@ from hf_mixtral_agent import agent_executor
 from innovation_pathfinder_ai.source_container.container import (
     all_sources
 )
-from innovation_pathfinder_ai.utils import
+from innovation_pathfinder_ai.utils.utils import extract_urls
+from innovation_pathfinder_ai.utils import logger
+
+logger = logger.get_console_logger("app")
 
 if __name__ == "__main__":
 
@@ -13,7 +16,7 @@ if __name__ == "__main__":
 
     def bot(history):
         response = infer(history[-1][0], history)
-        sources =
+        sources = extract_urls(all_sources)
         src_list = '\n'.join(sources)
         response_w_sources = response['output']+"\n\n\n Sources: \n\n\n"+src_list
         history[-1][1] = response_w_sources
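extract_urls is added in innovation_pathfinder_ai/utils/utils.py later in this diff; it pulls the last three links out of the formatted source strings that the tools append to all_sources. A minimal sketch of the call app.py now relies on (the sample strings below are invented):

from innovation_pathfinder_ai.utils.utils import extract_urls

sample_sources = [
    "Title: Paper A, Link: https://arxiv.org/abs/1111.11111, Summary: toy entry",
    "Title: Some Page, Link: https://en.wikipedia.org/wiki/Some_Page, Summary: toy entry",
]
print(extract_urls(sample_sources))
# -> ['https://arxiv.org/abs/1111.11111', 'https://en.wikipedia.org/wiki/Some_Page']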
hf_mixtral_agent.py
CHANGED
@@ -1,15 +1,9 @@
 # HF libraries
 from langchain_community.llms import HuggingFaceEndpoint
-from langchain_core.prompts import ChatPromptTemplate
-from langchain import hub
-import gradio as gr
 from langchain.agents import AgentExecutor
 from langchain.agents.format_scratchpad import format_log_to_str
-from langchain.agents.output_parsers import (
-    ReActJsonSingleInputOutputParser,
-)
+from langchain.agents.output_parsers import ReActJsonSingleInputOutputParser
 # Import things that are needed generically
-from typing import List, Dict
 from langchain.tools.render import render_text_description
 import os
 from dotenv import load_dotenv
@@ -17,12 +11,11 @@ from innovation_pathfinder_ai.structured_tools.structured_tools import (
     arxiv_search, get_arxiv_paper, google_search, wikipedia_search
 )
 
-# hacky and should be replaced with a database
-from innovation_pathfinder_ai.source_container.container import (
-    all_sources
-)
 from langchain import PromptTemplate
 from innovation_pathfinder_ai.templates.react_json_with_memory import template_system
+from innovation_pathfinder_ai.utils import logger
+
+logger = logger.get_console_logger("hf_mixtral_agent")
 
 config = load_dotenv(".env")
 HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
@@ -49,13 +42,6 @@ tools = [
 # get_arxiv_paper,
 ]
 
-tools_papers = [
-    arxiv_search,
-    get_arxiv_paper,
-
-]
-
-
 prompt = PromptTemplate.from_template(
     template=template_system
 )
@@ -87,15 +73,4 @@ agent_executor = AgentExecutor(
     #max_execution_time=60, # timout at 60 sec
     return_intermediate_steps=True,
     handle_parsing_errors=True,
-)
-
-# instantiate AgentExecutor
-agent_executor_noweb = AgentExecutor(
-    agent=agent,
-    tools=tools_papers,
-    verbose=True,
-    max_iterations=6, # cap number of iterations
-    #max_execution_time=60, # timout at 60 sec
-    return_intermediate_steps=True,
-    handle_parsing_errors=True,
 )
innovation_pathfinder_ai/database/db_handler.py
ADDED
@@ -0,0 +1,109 @@
from sqlmodel import SQLModel, create_engine, Session, select
from innovation_pathfinder_ai.database.schema import Sources
from innovation_pathfinder_ai.utils.logger import get_console_logger

sqlite_file_name = "innovation_pathfinder_ai/database/database.sqlite3"
sqlite_url = f"sqlite:///{sqlite_file_name}"
engine = create_engine(sqlite_url, echo=False)

logger = get_console_logger("db_handler")

SQLModel.metadata.create_all(engine)


def read_one(hash_id: dict):
    with Session(engine) as session:
        statement = select(Sources).where(Sources.hash_id == hash_id)
        sources = session.exec(statement).first()
        return sources


def add_one(data: dict):
    with Session(engine) as session:
        if session.exec(
            select(Sources).where(Sources.hash_id == data.get("hash_id"))
        ).first():
            logger.warning(f"Item with hash_id {data.get('hash_id')} already exists")
            return None  # or raise an exception, or handle as needed
        sources = Sources(**data)
        session.add(sources)
        session.commit()
        session.refresh(sources)
        logger.info(f"Item with hash_id {data.get('hash_id')} added to the database")
        return sources


def update_one(hash_id: dict, data: dict):
    with Session(engine) as session:
        # Check if the item with the given hash_id exists
        sources = session.exec(
            select(Sources).where(Sources.hash_id == hash_id)
        ).first()
        if not sources:
            logger.warning(f"No item with hash_id {hash_id} found for update")
            return None  # or raise an exception, or handle as needed
        for key, value in data.items():
            setattr(sources, key, value)
        session.commit()
        logger.info(f"Item with hash_id {hash_id} updated in the database")
        return sources


def delete_one(id: int):
    with Session(engine) as session:
        # Check if the item with the given hash_id exists
        sources = session.exec(
            select(Sources).where(Sources.hash_id == id)
        ).first()
        if not sources:
            logger.warning(f"No item with hash_id {id} found for deletion")
            return None  # or raise an exception, or handle as needed
        session.delete(sources)
        session.commit()
        logger.info(f"Item with hash_id {id} deleted from the database")


def add_many(data: list):
    with Session(engine) as session:
        for info in data:
            # Reuse add_one function for each item
            result = add_one(info)
            if result is None:
                logger.warning(
                    f"Item with hash_id {info.get('hash_id')} could not be added"
                )
            else:
                logger.info(
                    f"Item with hash_id {info.get('hash_id')} added to the database"
                )
        session.commit()  # Commit at the end of the loop


def delete_many(ids: list):
    with Session(engine) as session:
        for id in ids:
            # Reuse delete_one function for each item
            result = delete_one(id)
            if result is None:
                logger.warning(f"No item with hash_id {id} found for deletion")
            else:
                logger.info(f"Item with hash_id {id} deleted from the database")
        session.commit()  # Commit at the end of the loop


def read_all(query: dict = None):
    with Session(engine) as session:
        statement = select(Sources)
        if query:
            statement = statement.where(
                *[getattr(Sources, key) == value for key, value in query.items()]
            )
        sources = session.exec(statement).all()
        return sources


def delete_all():
    with Session(engine) as session:
        session.exec(Sources).delete()
        session.commit()
        logger.info("All items deleted from the database")
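For reference, a minimal sketch of how these helpers are meant to be used by the tools in this PR (the record values below are invented; in practice parse_list_to_dicts builds them):

from innovation_pathfinder_ai.database.db_handler import add_many, read_all

records = [
    {
        "url": "https://arxiv.org/abs/2401.00001",
        "title": "Example Paper",
        "hash_id": "0cc175b9c0f1b6a831c399e269772661",  # md5 of the link in this PR's convention
        "summary": "Short summary of the paper.",
    },
]
add_many(records)                      # skips any record whose hash_id already exists
print(read_all())                      # every stored Sources row
print(read_all({"embedded": False}))   # rows filtered by column value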
innovation_pathfinder_ai/database/schema.py
ADDED
@@ -0,0 +1,15 @@
from sqlmodel import SQLModel, Field
from typing import Optional

import datetime

class Sources(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    url: str = Field()
    title: Optional[str] = Field(default="NA", unique=False)
    hash_id: str = Field(unique=True)
    created_at: float = Field(default=datetime.datetime.now().timestamp())
    summary: str = Field(default="")
    embedded: bool = Field(default=False)

    __table_args__ = {"extend_existing": True}
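A small illustration of what a row looks like under this schema (values are invented); id, created_at and embedded fall back to their defaults:

from innovation_pathfinder_ai.database.schema import Sources

row = Sources(
    url="https://example.org/article",
    title="Example Source",
    hash_id="9b2d5b4678781e53038e91ea5324530a",  # any unique string; the tools derive it from the link
    summary="A short summary of the source.",
)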
innovation_pathfinder_ai/structured_tools/structured_tools.py
CHANGED
@@ -6,31 +6,32 @@ from langchain_community.utilities import WikipediaAPIWrapper
 #from langchain.tools import Tool
 from langchain_community.utilities import GoogleSearchAPIWrapper
 import arxiv
-
+import ast
 # hacky and should be replaced with a database
 from innovation_pathfinder_ai.source_container.container import (
     all_sources
 )
-from innovation_pathfinder_ai.utils import
+from innovation_pathfinder_ai.utils.utils import (
+    parse_list_to_dicts, format_wiki_summaries, format_arxiv_documents, format_search_results
+)
+from innovation_pathfinder_ai.database.db_handler import (
+    add_many
+)
 
 @tool
 def arxiv_search(query: str) -> str:
     """Search arxiv database for scientific research papers and studies. This is your primary information source.
     always check it first when you search for information, before using any other tool."""
-    # return "LangChain"
     global all_sources
-    arxiv_retriever = ArxivRetriever(load_max_docs=
+    arxiv_retriever = ArxivRetriever(load_max_docs=3)
     data = arxiv_retriever.invoke(query)
     meta_data = [i.metadata for i in data]
-
-
-
-
-
-
-    # formatted_info = format_info_list(all_sources)
-
-    return meta_data.__str__()
+    formatted_sources = format_arxiv_documents(data)
+    all_sources += formatted_sources
+    parsed_sources = parse_list_to_dicts(formatted_sources)
+    add_many(parsed_sources)
+
+    return data.__str__()
 
 @tool
 def get_arxiv_paper(paper_id:str) -> None:
@@ -52,17 +53,13 @@ def get_arxiv_paper(paper_id:str) -> None:
 @tool
 def google_search(query: str) -> str:
     """Search Google for additional results when you can't answer questions using arxiv search or wikipedia search."""
-    # return "LangChain"
     global all_sources
 
     websearch = GoogleSearchAPIWrapper()
-    search_results:dict = websearch.results(query,
-
-
-
-    # formatted_string = "Title: {title}, link: {link}, snippet: {snippet}".format(**organic_source)
-    cleaner_sources = ["Title: {title}, link: {link}, snippet: {snippet}".format(**i) for i in search_results]
-
+    search_results:dict = websearch.results(query, 3)
+    cleaner_sources =format_search_results(search_results)
+    parsed_csources = parse_list_to_dicts(cleaner_sources)
+    add_many(parsed_csources)
     all_sources += cleaner_sources
 
     return cleaner_sources.__str__()
@@ -75,5 +72,9 @@ def wikipedia_search(query: str) -> str:
     api_wrapper = WikipediaAPIWrapper()
     wikipedia_search = WikipediaQueryRun(api_wrapper=api_wrapper)
     wikipedia_results = wikipedia_search.run(query)
-
-
+    formatted_summaries = format_wiki_summaries(wikipedia_results)
+    all_sources += formatted_summaries
+    parsed_summaries = parse_list_to_dicts(formatted_summaries)
+    add_many(parsed_summaries)
+
+    return wikipedia_results.__str__()
innovation_pathfinder_ai/utils.py
DELETED
@@ -1,42 +0,0 @@
def create_wikipedia_urls_from_text(text):
    """
    Extracts page titles from a given text and constructs Wikipedia URLs for each title.

    Args:
    - text (str): A string containing multiple sections, each starting with "Page:" followed by the title.

    Returns:
    - list: A list of Wikipedia URLs constructed from the extracted titles.
    """
    # Split the text into sections based on "Page:" prefix
    sections = text.split("Page: ")
    # Remove the first item if it's empty (in case the text starts with "Page:")
    if sections[0].strip() == "":
        sections = sections[1:]

    urls = []  # Initialize an empty list to store the URLs
    for section in sections:
        # Extract the title, which is the string up to the first newline
        title = section.split("\n", 1)[0]
        # Replace spaces with underscores for the URL
        url_title = title.replace(" ", "_")
        # Construct the URL and add it to the list
        url = f"https://en.wikipedia.org/wiki/{url_title}"
        urls.append(url)

    return urls

def collect_urls(data_list):
    urls = []
    for item in data_list:
        # Check if item is a string and contains 'link:'
        if isinstance(item, str) and 'link:' in item:
            start = item.find('link:') + len('link: ')
            end = item.find(',', start)
            url = item[start:end if end != -1 else None].strip()
            urls.append(url)
        # Check if item is a dictionary and has 'Entry ID'
        elif isinstance(item, dict) and 'Entry ID' in item:
            urls.append(item['Entry ID'])
    last_sources = urls[-3:]
    return last_sources
innovation_pathfinder_ai/utils/logger.py
ADDED
@@ -0,0 +1,20 @@
# logger.py

import logging
from rich.logging import RichHandler
from typing import Optional


def get_console_logger(name: Optional[str] = "default") -> logging.Logger:
    logger = logging.getLogger(name)
    if not logger.handlers:
        logger.setLevel(logging.DEBUG)
        console_handler = RichHandler()
        console_handler.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    return logger
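Usage elsewhere in this PR is a one-liner; a quick sketch (the module name is whatever the caller passes):

from innovation_pathfinder_ai.utils.logger import get_console_logger

logger = get_console_logger("db_handler")
logger.info("database ready")          # rendered through rich's RichHandler
logger.warning("duplicate hash_id")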
innovation_pathfinder_ai/utils/utils.py
ADDED
@@ -0,0 +1,171 @@
import hashlib
import datetime

from innovation_pathfinder_ai.utils import logger

logger = logger.get_console_logger("utils")

def create_wikipedia_urls_from_text(text):
    """
    Extracts page titles from a given text and constructs Wikipedia URLs for each title.

    Args:
    - text (str): A string containing multiple sections, each starting with "Page:" followed by the title.

    Returns:
    - list: A list of Wikipedia URLs constructed from the extracted titles.
    """
    # Split the text into sections based on "Page:" prefix
    sections = text.split("Page: ")
    # Remove the first item if it's empty (in case the text starts with "Page:")
    if sections[0].strip() == "":
        sections = sections[1:]

    urls = []  # Initialize an empty list to store the URLs
    for section in sections:
        # Extract the title, which is the string up to the first newline
        title = section.split("\n", 1)[0]
        # Replace spaces with underscores for the URL
        url_title = title.replace(" ", "_")
        # Construct the URL and add it to the list
        url = f"https://en.wikipedia.org/wiki/{url_title}"
        urls.append(url)
    print(urls)

    return urls

def extract_urls(data_list):
    """
    Extracts URLs from a list of of dictionaries.

    Parameters:
    - formatted_list (list): A list of dictionaries, each containing 'Title:', 'link:', and 'summary:'.

    Returns:
    - list: A list of URLs extracted from the dictionaries.
    """
    urls = []
    print(data_list)
    for item in data_list:
        try:
            # Find the start and end indices of the URL
            lower_case = item.lower()
            link_prefix = 'link: '
            summary_prefix = ', summary:'
            start_idx = lower_case.index(link_prefix) + len(link_prefix)
            end_idx = lower_case.index(summary_prefix, start_idx)
            # Extract the URL using the indices found
            url = item[start_idx:end_idx]
            urls.append(url)
        except ValueError:
            # Handles the case where 'link: ' or ', summary:' is not found in the string
            print("Could not find a URL in the item:", item)
    last_sources = urls[-3:]
    return last_sources

def format_wiki_summaries(input_text):
    """
    Parses a given text containing page titles and summaries, formats them into a list of strings,
    and appends Wikipedia URLs based on titles.

    Parameters:
    - input_text (str): A string containing titles and summaries separated by specific markers.

    Returns:
    - list: A list of formatted strings with titles, summaries, and Wikipedia URLs.
    """
    # Splitting the input text into individual records based on double newlines
    records = input_text.split("\n\n")

    formatted_records_with_urls = []
    for record in records:
        if "Page:" in record and "Summary:" in record:
            title_line, summary_line = record.split("\n", 1)  # Splitting only on the first newline
            title = title_line.replace("Page: ", "").strip()
            summary = summary_line.replace("Summary: ", "").strip()
            # Replace spaces with underscores for the URL and construct the Wikipedia URL
            url_title = title.replace(" ", "_")
            wikipedia_url = f"https://en.wikipedia.org/wiki/{url_title}"
            # Append formatted string with title, summary, and URL
            formatted_record = "Title: {title}, Link: {wikipedia_url}, Summary: {summary}".format(
                title=title, summary=summary, wikipedia_url=wikipedia_url)
            formatted_records_with_urls.append(formatted_record)
        else:
            print("Record format error, skipping record:", record)

    return formatted_records_with_urls

def format_arxiv_documents(documents):
    """
    Formats a list of document objects into a list of strings.
    Each document object is assumed to have a 'metadata' dictionary with 'Title' and 'Entry ID',
    and a 'page_content' attribute for content.

    Parameters:
    - documents (list): A list of document objects.

    Returns:
    - list: A list of formatted strings with titles, links, and content snippets.
    """
    formatted_documents = [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(
            title=doc.metadata['Title'],
            link=doc.metadata['Entry ID'],
            snippet=doc.page_content  # Adjust the snippet length as needed
        )
        for doc in documents
    ]
    return formatted_documents

def format_search_results(search_results):
    """
    Formats a list of dictionaries containing search results into a list of strings.
    Each dictionary is expected to have the keys 'title', 'link', and 'snippet'.

    Parameters:
    - search_results (list): A list of dictionaries, each containing 'title', 'link', and 'snippet'.

    Returns:
    - list: A list of formatted strings based on the search results.
    """
    formatted_results = [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(**i)
        for i in search_results
    ]
    return formatted_results

def parse_list_to_dicts(items: list) -> list:
    parsed_items = []
    for item in items:
        # Extract title, link, and summary from each string
        title_start = item.find('Title: ') + len('Title: ')
        link_start = item.find('Link: ') + len('Link: ')
        summary_start = item.find('Summary: ') + len('Summary: ')

        title_end = item.find(', Link: ')
        link_end = item.find(', Summary: ')
        summary_end = len(item)

        title = item[title_start:title_end]
        link = item[link_start:link_end]
        summary = item[summary_start:summary_end]

        # Use the hash_text function for the hash_id
        hash_id = hash_text(link)

        # Construct the dictionary for each item
        parsed_item = {
            "url": link,
            "title": title,
            "hash_id": hash_id,
            "summary": summary
        }
        parsed_items.append(parsed_item)
    return parsed_items

def hash_text(text: str) -> str:
    return hashlib.md5(text.encode()).hexdigest()


def convert_timestamp_to_datetime(timestamp: str) -> str:
    return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")
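To make the data flow concrete, a minimal sketch (sample values invented) of how the formatting helpers and parse_list_to_dicts fit together before add_many writes to SQLite:

from innovation_pathfinder_ai.utils.utils import format_search_results, parse_list_to_dicts

search_results = [   # shape returned by GoogleSearchAPIWrapper.results(); values are made up
    {"title": "Example", "link": "https://example.org", "snippet": "An example result."},
]
formatted = format_search_results(search_results)
# ['Title: Example, Link: https://example.org, Summary: An example result.']
parsed = parse_list_to_dicts(formatted)
# [{'url': 'https://example.org', 'title': 'Example',
#   'hash_id': '<md5 of the link>', 'summary': 'An example result.'}]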
requirements.txt
CHANGED
@@ -8,4 +8,6 @@ wikipedia
 gradio==3.48.0
 chromadb
 google_api_python_client
-pypdf2
+pypdf2
+sqlmodel
+rich