Spaces:

elyx
/

unhcr

Runtime error

App Files Files Community

elyxlz committed on Mar 14, 2023

Commit

0c7add2

1 Parent(s): be8dc83

initial commit

Browse files

Files changed (9) hide show

.gitattributes +1 -33
.gitignore +2 -0
app.py +104 -0
config/conf_0.1.yaml +52 -0
data/store.pkl +3 -0
ingest.py +78 -0
modules.py +75 -0
requirements.txt +7 -0
tools.py +76 -0

.gitattributes CHANGED Viewed

@@ -1,34 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text






















1	*.pkl filter=lfs diff=lfs merge=lfs -text
2	+ .pkl filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .env
2	+ __pycache__

app.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import os
+from typing import Optional, Tuple
+import gradio as gr
+import argparse
+import datetime
+import pickle
+#import whisper
+import dotenv
+import sys
+from io import StringIO
+import re
+dotenv.load_dotenv()
+from langchain.callbacks import get_openai_callback
+import hydra
+from omegaconf import DictConfig, open_dict, OmegaConf
+class ChatbotAgentGradio():
+    def __init__(
+        self,
+        config_name
+    ):
+        config = OmegaConf.load(f'./config/{config_name}.yaml')
+        self.chatbot = hydra.utils.instantiate(config.model, _convert_="partial")
+    def chat(self,
+             inp: str,
+             history: Optional[Tuple[str, str]],
+             ):
+        """Method for integration with gradio Chatbot"""
+        print("\n==== date/time: " + str(datetime.datetime.now()) + " ====")
+        print("inp: " + inp)
+        history = history or []
+        output = self.chatbot.run(inp)
+        history.append((inp, output))
+        return history, history#, ""
+    def update_foo(self, widget, state):
+        if widget:
+            state = widget
+            return state
+    def launch_app(self):
+        block = gr.Blocks(css=".gradio-container {background-color: lightgray}")
+        with block:
+            instance = gr.State()
+            show_chain_state = gr.State(False)
+            with gr.Row():
+                gr.Markdown("<h3><center>UNHCR</center></h3>")
+            with gr.Row():
+                chatbot = gr.Chatbot()
+            with gr.Row():
+                message = gr.Textbox(
+                    label="What's your question?",
+                    lines=1,
+                )
+                submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
+            state = gr.State()
+            agent_state = gr.State()
+            submit.click(self.chat, inputs=[message, state], outputs=[chatbot, state])
+            message.submit(self.chat, inputs=[message, state], outputs=[chatbot, state])
+        block.launch(debug=True, share=False, server_port=7861)#, server_name='192.168.0.73', )
+def simple(config):
+    config = OmegaConf.load(f'./config/{config}.yaml')
+    chatbot = hydra.utils.instantiate(config.model, _convert_="partial")
+    while True:
+        inp = input("\nUser: ")
+        print(chatbot.run(inp))
+if __name__ == '__main__':
+    # Terminal-only alternative to the web UI (disabled):
+    #simple('conf_0.1')
+    # Build the agent from ./config/conf_0.1.yaml and serve the Gradio app.
+    app = ChatbotAgentGradio('conf_0.1')
+    app.launch_app()
+    # Disabled experiment; QA, store and QAGradio are not defined in the visible code.
+    #QA = QA(store, k=1)
+    #app = QAGradio(QA)
+    #app.launch_app()

config/conf_0.1.yaml ADDED Viewed

	@@ -0,0 +1,52 @@

+# chatbot model
+model:
+  _target_: modules.initialize_agent
+  agent: "conversational-react-description" # langchain template for agent
+  tools:
+    - _target_: langchain.agents.Tool
+      name: "Content Search"
+      func:
+        _target_: tools.SemanticSearch
+        threshold: 0.5
+        k: 5
+      # Folded block scalar: the lines below are joined with spaces into one string.
+      # (The previous "^" is not a YAML indicator and ended up as literal text
+      # at the start of the tool description.)
+      description: >-
+        A content search through the UNHCR documents, it will return relevant extracts for your query.
+        The action input should be a full english sentence.
+        ALWAYS use this to answer ANY question. If the tool doesn't return anything, say that you don't know.
+  llm:
+    _target_: langchain.llms.OpenAI
+    temperature: 0
+    openai_api_key: ${oc.env:OPENAI_API_KEY} # environment variable
+  memory:
+    _target_: langchain.chains.conversation.memory.ConversationBufferWindowMemory
+    memory_key: "chat_history"
+    k: 5 # how many of the past interactions it keeps
+    #verbose: True
+  prefix: |
+    - You are an AI whose purpose is to help answer questions about the UNHCR documents.
+    - You answer in a factual manner, always basing your answer on the context provided to you
+    - You are free to ignore irrelevant information
+    - If you do not know something, you will say that you don't know.
+    - Give long answers, answering every question with a lot of detail.
+    TOOLS:
+    ------
+    You have access to the following tools:
+  suffix: |
+    Begin!
+    Previous conversation history:
+    {chat_history}
+    New input: {input}
+    {agent_scratchpad}
+  ai_prefix: "AI"
+  verbose: True

data/store.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cccec9eb3ff0488f652e5db4ec9f263979c25deaecf773b4413c108f5493fb0e
+size 6998194

ingest.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from pathlib import Path
+import faiss
+import pickle
+from PyPDF2 import PdfReader
+from tqdm import tqdm
+import glob
+import os
+import re
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import FAISS
+from langchain.document_loaders import TextLoader
+import dotenv
+dotenv.load_dotenv()
+def get_all_pdf_filenames(paths, recursive):
+    extensions = ["pdf"]
+    filenames = []
+    for ext_name in extensions:
+        ext = f"**/*.{ext_name}" if recursive else f"*.{ext_name}"
+        for path in paths:
+            filenames.extend(glob.glob(os.path.join(path, ext), recursive=recursive))
+    return filenames
+#all_pdf_paths = get_all_pdf_filenames(["/mnt/c/users/elio/Downloads/UNHCR Emergency Manual"], recursive=True)
+#print(f"Found {len(all_pdf_paths)} PDF files")
+#assert len(all_pdf_paths) > 0
+#all_pdf_paths = ['/mnt/c/users/elio/Downloads/UNHCR Emergency Manual/UNHCR Emergency Manual/46a9e29a2.pdf']
+class Ingester():
+    """
+    Vectorises chunks of the data and puts source as metadata
+    """
+    def __init__(
+        self,
+        separator='\n',
+        chunk_overlap=200,
+        chunk_size=200,
+    ):
+        self.splitter = CharacterTextSplitter(chunk_size=chunk_size, separator=separator, chunk_overlap=chunk_overlap)
+    def ingest(self, path):
+        #ps = get_all_pdf_filenames([path], recursive=True) # get paths
+        ps = ['/mnt/c/users/elio/Downloads/UNHCR Emergency Manual/UNHCR Emergency Manual/46a9e29a2.pdf']
+        data = []
+        sources = []
+        for p in tqdm(ps): # extract data from paths
+            reader = PdfReader(p)
+            page = '\n'.join([reader.pages[i].extract_text() for i in range(len(reader.pages))])
+            data.append(page)
+            sources.append(p)
+        docs = []
+        metadatas = []
+        for i, d in tqdm(enumerate(data)): # split text and make documents
+            splits = self.splitter.split_text(d)
+            if all(s != "" for s in splits):
+                docs.extend(splits)
+                metadatas.extend([{"source": sources[i]}] * len(splits))
+        assert len(docs) > 0
+        print("Extracting embeddings")
+        store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
+        with open(os.path.join('./data', 'store.pkl'), "wb") as f:
+            pickle.dump(store, f)
+        print(f"Saved store at {os.path.join('./data', 'store.pkl')}.")
+# Runs at module level: executing (or importing) this file builds the store.
+ingester = Ingester(chunk_size=2000)
+# NOTE(review): machine-specific path; adjust before running elsewhere.
+ingester.ingest("/mnt/c/users/elio/Downloads/UNHCR Emergency Manual")

modules.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""Load agent."""
+from typing import Any, List, Optional
+from langchain.agents.agent import AgentExecutor
+from langchain.agents.loading import AGENT_TO_CLASS, load_agent
+from langchain.agents.tools import Tool
+from langchain.callbacks.base import BaseCallbackManager
+from langchain.llms.base import BaseLLM
+def initialize_agent(
+    tools: List[Tool],
+    llm: BaseLLM,
+    agent: Optional[str] = None,
+    callback_manager: Optional[BaseCallbackManager] = None,
+    agent_path: Optional[str] = None,
+    prefix: Optional[str] = None,
+    suffix: Optional[str]= None,
+    ai_prefix: Optional[str] = None,
+    human_prefix: Optional[str] = None,
+    **kwargs: Any,
+) -> AgentExecutor:
+    """Load agent given tools and LLM.
+    Args:
+        tools: List of tools this agent has access to.
+        llm: Language model to use as the agent.
+        agent: The agent to use. Valid options are:
+            `zero-shot-react-description`
+            `react-docstore`
+            `self-ask-with-search`
+            `conversational-react-description`
+            If None and agent_path is also None, will default to
+            `zero-shot-react-description`.
+        callback_manager: CallbackManager to use. Global callback manager is used if
+            not provided. Defaults to None.
+        agent_path: Path to serialized agent to use.
+        **kwargs: Additional key word arguments to pass to the agent.
+    Returns:
+        An agent.
+    """
+    if agent is None and agent_path is None:
+        agent = "zero-shot-react-description"
+    if agent is not None and agent_path is not None:
+        raise ValueError(
+            "Both `agent` and `agent_path` are specified, "
+            "but at most only one should be."
+        )
+    if agent is not None:
+        if agent not in AGENT_TO_CLASS:
+            raise ValueError(
+                f"Got unknown agent type: {agent}. "
+                f"Valid types are: {AGENT_TO_CLASS.keys()}."
+            )
+        agent_cls = AGENT_TO_CLASS[agent]
+        agent_obj = agent_cls.from_llm_and_tools(
+            llm, tools, prefix=prefix, suffix=suffix, ai_prefix=ai_prefix, human_prefix=human_prefix, callback_manager=callback_manager # added prefix and suffix
+        )
+    elif agent_path is not None:
+        agent_obj = load_agent(
+            agent_path, llm=llm, tools=tools, callback_manager=callback_manager
+        )
+    else:
+        raise ValueError(
+            "Somehow both `agent` and `agent_path` are None, "
+            "this should never happen."
+        )
+    return AgentExecutor.from_agent_and_tools(
+        agent=agent_obj,
+        tools=tools,
+        callback_manager=callback_manager,
+        **kwargs,
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+faiss-cpu
+langchain
+openai
+numpy
+gradio
+PyPDF2
+python-dotenv
+# imported by app.py (hydra, omegaconf), ingest.py (tqdm) and tools.py (requests)
+hydra-core
+omegaconf
+tqdm
+requests

tools.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import faiss
+import pickle
+import os
+from PyPDF2 import PdfReader
+import glob
+from pathlib import Path
+import re
+import requests
+from langchain.chains import LLMChain
+from langchain.llms import OpenAI
+from langchain import PromptTemplate
+from langchain.vectorstores import FAISS
+from langchain.embeddings import OpenAIEmbeddings
+import dotenv
+dotenv.load_dotenv()
+def call_semantic_api(query, store_path, k):
+    """Send a search request built from ``query``, ``store_path`` and ``k``.
+    The three arguments are packed into a dict and handed to
+    ``semantic_search.search``; whatever that call returns is returned as is.
+    """
+    payload = {
+        "query": query,
+        "store_path": store_path,
+        "k": k,
+    }
+    # Earlier HTTP variant, kept for reference:
+    # response = requests.post("http://localhost:3001/search", json=payload)
+    # NOTE(review): `semantic_search` is not defined or imported in the visible
+    # code, so this line presumably raises NameError when called - confirm the
+    # intended target before using this function.
+    response = semantic_search.search(payload)
+    return response
+class SemanticSearch():
+    def __init__(
+        self,
+        threshold: float,
+        with_source=False,
+        k=5,
+    ):
+        self.threshold = threshold
+        self.with_source = with_source
+        self.k = k
+        with open('./data/store.pkl', 'rb') as f:
+            self.db = pickle.load(f)
+    def __call__(self, query):
+        documents = self.db.similarity_search_with_score(query, k=self.k)
+        if len(documents) == 0:
+            return None
+        if not self.with_source:
+            output = '\n\n\n'.join([i[0].page_content for i in documents])
+        else:
+            output = '\n\n\n'.join([i[0].page_content + '\n\nSource:' + os.path.basename(
+                str(i[0].metadata['source']) + '\n') for i in documents])
+        return output
+class ContentSearch():
+    def __init__(
+            self,
+            semantic_search,
+            prompt_template,
+    ):
+        self.semantic_search = semantic_search
+        self.prompt_template = prompt_template
+    def __call__(self, query):
+        content = self.semantic_search(query)
+        if content is None:
+            return "No results found"
+        else:
+            return self.prompt_template.format(content=content)