Spaces:

r3gm
/

ConversaDocs

Runtime error

App Files Files Community

Roger Condori committed on Jul 23, 2023

Commit

a9c396e

unverified ·

1 Parent(s): a7536a1

Add files via upload

Browse files

Files changed (4) hide show

app.py +123 -0
conversadocs/bones.py +216 -0
demo_docs/demo.txt +25 -0
requirements.txt +12 -0

app.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import gradio as gr
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
+from langchain.vectorstores import DocArrayInMemorySearch
+from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
+from langchain.memory import ConversationBufferMemory
+from langchain.chat_models import ChatOpenAI
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain import HuggingFaceHub
+from langchain.llms import LlamaCpp
+from huggingface_hub import hf_hub_download
+from langchain.document_loaders import (
+    EverNoteLoader,
+    TextLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredODTLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredWordDocumentLoader,
+    PyPDFLoader,
+)
+import param
+import os
+import torch
+from conversadocs.bones import DocChat
+# Single DocChat instance shared by all Gradio callbacks defined in this file.
+dc = DocChat()
+##### GRADIO CONFIG ####
+# llama-cpp-python is installed with pip when the app starts: if CUDA is
+# available it is force-reinstalled with the cuBLAS CMake flag, otherwise the
+# plain package is installed.
+# NOTE(review): the exit status of os.system is ignored, so a failed install
+# is not detected here.
+if torch.cuda.is_available():
+    print("CUDA is available on this system.")
+    os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose')
+else:
+    print("CUDA is not available on this system.")
+    os.system('pip install llama-cpp-python')
+# Extra CSS passed to gr.Blocks: centres the main chat column and caps its width.
+css="""
+#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
+"""
+# HTML header rendered at the top of the chat tab. The extension list names
+# ppt/pptx (previously misspelled "ptt, pttx").
+title = """
+<div style="text-align: center;max-width: 700px;">
+    <h1>Chat with Documents 📚 - Falcon, Llama-2</h1>
+    <p style="text-align: center;">Upload txt, pdf, doc, docx, enex, epub, html, md, odt, ppt, pptx; click the "Click to Upload Files" button, <br />
+    Wait for the Status to show Loaded documents, start typing your questions. <br />
+    The app is set to store chat-history</p>
+</div>
+"""
+# Name of the Gradio theme passed to gr.Blocks.
+theme='aliabid94/new-theme'
+def flag():
+    """Return the placeholder status text used as the first step of a callback chain."""
+    status = "PROCESSING..."
+    return status
+def upload_file(files, max_docs):
+    """Hand the uploaded files to the shared DocChat and return its status text.
+
+    Args:
+        files: Uploaded file objects; only their ``.name`` attribute is read.
+        max_docs: Value forwarded unchanged as the second argument of
+            ``dc.call_load_db``.
+
+    Returns:
+        Whatever ``dc.call_load_db`` returns.
+    """
+    paths = []
+    for uploaded in files:
+        paths.append(uploaded.name)
+    status = dc.call_load_db(paths, max_docs)
+    return status
+def predict(message, chat_history, max_k):
+    """Send one question to the shared DocChat and refresh the chat widget.
+
+    Args:
+        message: Question text typed by the user.
+        chat_history: Current chatbot value; it is not used here because the
+            history is read back from ``dc``.
+        max_k: Value forwarded as the second argument of ``dc.convchain``.
+
+    Returns:
+        A pair ``("", history)``: an empty string for the textbox and the
+        result of ``dc.get_chats()`` for the chatbot.
+    """
+    print(message)
+    answer = dc.convchain(message, max_k)
+    print(answer)
+    history = dc.get_chats()
+    return "", history
+def convert():
+    """Render the sources of the last answer as an HTML fragment.
+
+    ``dc.get_sources()`` is read as a flat sequence in which consecutive
+    pairs of entries belong together: index 1 of the first entry is the chunk
+    text and index 1 of the second entry is a metadata mapping with a
+    ``"source"`` key and, optionally, a ``"page"`` key.
+
+    Returns:
+        The concatenated HTML sections, or an empty string when there are no
+        sources to show.
+    """
+    import html  # local import: only this function needs it
+
+    # get_sources() may hand back None when nothing has been retrieved yet.
+    docs = dc.get_sources() or []
+    sections = []
+    for content, meta in zip(docs[0::2], docs[1::2]):
+        # Escape first so text from uploaded documents cannot inject markup,
+        # then turn line breaks into <br>.
+        txt = html.escape(content[1]).replace("\n", "<br>")
+        metadata = meta[1]
+        sc = "Archive: " + html.escape(str(metadata["source"]))
+        # Not every loader reports a page number.
+        if "page" in metadata:
+            pg = "Page: " + str(metadata["page"])
+        else:
+            pg = "Document Data"
+        sections.append(f"<hr><h3 style='color:red;'>{pg}</h3><p>{txt}</p><p>{sc}</p>")
+    return "".join(sections)
+# Gradio UI: a chat tab, a tab with chat options and a tab for swapping the LLM.
+with gr.Blocks(theme=theme, css=css) as demo:
+    with gr.Tab("Chat"):
+        with gr.Column(elem_id="col-container"):
+            gr.HTML(title)
+            # NOTE(review): the picker is restricted to "pdf" although the title
+            # text advertises more formats -- confirm whether that is intended.
+            upload_button = gr.UploadButton("Click to Upload Files", file_types=["pdf"], file_count="multiple")
+            file_output = gr.HTML()
+            chatbot = gr.Chatbot([], elem_id="chatbot").style(height=300)
+            msg = gr.Textbox(label="Question", placeholder="Type your question and hit Enter ")
+        with gr.Column():
+            # Filled by convert() with the sources of the last answer.
+            sou = gr.HTML("")
+    with gr.Tab("Chat Options"):
+        max_docs = gr.inputs.Slider(1, 10, default=3, label="Maximum querys to the DB.", step=1)
+        row_table = gr.HTML("<hr><h4> </h4>")
+        clear_button = gr.Button("CLEAR CHAT HISTORY", )
+        link_output = gr.HTML("")
+        # Show the placeholder, clear the stored history, then empty the chat widget.
+        clear_button.click(flag,[],[link_output]).then(dc.clr_history,[], [link_output]).then(lambda: None, None, chatbot, queue=False)
+        # Show the placeholder, then rebuild the document DB from the upload.
+        upload_button.upload(flag,[],[file_output]).then(upload_file, [upload_button, max_docs], file_output)
+    with gr.Tab("Change model"):
+        gr.HTML("<h3>Only models from the GGML library are accepted.</h3>")
+        repo_ = gr.Textbox(label="Repository" ,value="TheBloke/Llama-2-7B-Chat-GGML")
+        file_ = gr.Textbox(label="File name" ,value="llama-2-7b-chat.ggmlv3.q2_K.bin")
+        max_tokens = gr.inputs.Slider(1, 2048, default=16, label="Max new tokens", step=1)
+        temperature = gr.inputs.Slider(0.1, 1., default=0.2, label="Temperature", step=0.1)
+        # The two sampling sliders previously had each other's range and
+        # default, so change_llm received top_p=50 and top_k=0.95. Top K is a
+        # count and Top P a probability, matching change_llm's own defaults.
+        top_k = gr.inputs.Slider(0, 100, default=50, label="Top K", step=1)
+        top_p = gr.inputs.Slider(0.01, 1., default=0.95, label="Top P", step=0.01)
+        repeat_penalty = gr.inputs.Slider(0.1, 100., default=1.2, label="Repeat penalty", step=0.1)
+        change_model_button = gr.Button("Load Model")
+        model_verify = gr.HTML("Loaded model Falcon 7B-instruct")
+        default_model = gr.HTML("<hr><h4>Default Model</h4>")
+        falcon_button = gr.Button("FALCON 7B-Instruct")
+    # Answer the question, then render the sources that backed the answer.
+    msg.submit(predict,[msg, chatbot, max_docs],[msg, chatbot]).then(convert,[],[sou])
+    # Input order follows the positional parameters of DocChat.change_llm.
+    change_model_button.click(dc.change_llm,[repo_, file_, max_tokens, temperature, top_p, top_k, repeat_penalty, max_docs],[model_verify])
+    falcon_button.click(dc.default_falcon_model, [], [model_verify])
+demo.launch(enable_queue=True)

conversadocs/bones.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import gradio as gr
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
+from langchain.vectorstores import DocArrayInMemorySearch
+from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
+from langchain.memory import ConversationBufferMemory
+from langchain.chat_models import ChatOpenAI
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain import HuggingFaceHub
+from langchain.llms import LlamaCpp
+from huggingface_hub import hf_hub_download
+import param
+import os
+import torch
+from langchain.document_loaders import (
+    EverNoteLoader,
+    TextLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredODTLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredWordDocumentLoader,
+    PyPDFLoader,
+)
+#YOUR_HF_TOKEN = os.getenv("My_hf_token")
+# Default LLM: the tiiuae/falcon-7b-instruct repo accessed through
+# HuggingFaceHub, using the API token read from the "My_hf_token" environment
+# variable.
+# NOTE: the trailing comma after the closing parenthesis makes `llm_api` a
+# one-element tuple, which is why the rest of this module reads `llm_api[0]`.
+llm_api=HuggingFaceHub(
+    huggingfacehub_api_token=os.getenv("My_hf_token"),
+    repo_id="tiiuae/falcon-7b-instruct",
+    model_kwargs={
+        "temperature":0.2,
+        "max_new_tokens":500,
+        "top_k":50,
+        "top_p":0.95,
+        "repetition_penalty":1.2,
+        },), #ChatOpenAI(model_name=llm_name, temperature=0)
+#alter
+def load_db(files, chain_type, k, llm):
+    """Build a conversational retrieval chain over the given documents.
+
+    Args:
+        files: A single path or an iterable of paths. Files whose extension
+            has no loader in ``EXTENSIONS`` are skipped. If nothing is loaded,
+            ``demo_docs/demo.txt`` is used instead.
+        chain_type: Forwarded to ``ConversationalRetrievalChain.from_llm``.
+        k: Forwarded to the retriever as ``search_kwargs={"k": k}``.
+        llm: Language model handed to the chain.
+
+    Returns:
+        The chain, configured to also return the source documents and the
+        generated question.
+    """
+    EXTENSIONS = {
+        ".txt": (TextLoader, {"encoding": "utf8"}),
+        ".pdf": (PyPDFLoader, {}),
+        ".doc": (UnstructuredWordDocumentLoader, {}),
+        ".docx": (UnstructuredWordDocumentLoader, {}),
+        ".enex": (EverNoteLoader, {}),
+        ".epub": (UnstructuredEPubLoader, {}),
+        ".html": (UnstructuredHTMLLoader, {}),
+        ".md": (UnstructuredMarkdownLoader, {}),
+        ".odt": (UnstructuredODTLoader, {}),
+        ".ppt": (UnstructuredPowerPointLoader, {}),
+        ".pptx": (UnstructuredPowerPointLoader, {}),
+    }
+    # A bare string would otherwise be iterated character by character and
+    # never match any extension.
+    if isinstance(files, str):
+        files = [files]
+    # Pick a loader per file based on its (case-insensitive) extension.
+    documents = []
+    for file in files or []:
+        ext = os.path.splitext(file)[1].lower()
+        loader_spec = EXTENSIONS.get(ext)
+        if loader_spec is None:
+            continue  # unsupported type: skip silently, as before
+        loader_class, loader_args = loader_spec
+        documents.extend(loader_class(file, **loader_args).load())
+    # Fall back to the bundled demo document when nothing was loaded.
+    if not documents:
+        loader_class, loader_args = EXTENSIONS['.txt']
+        documents = loader_class('demo_docs/demo.txt', **loader_args).load()
+    # Split documents into overlapping chunks.
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    docs = text_splitter.split_documents(documents)
+    # Embed the chunks and index them in an in-memory vector store.
+    embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
+    db = DocArrayInMemorySearch.from_documents(docs, embeddings)
+    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
+    # The chain keeps no memory of its own; callers pass the chat history in.
+    qa = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        chain_type=chain_type,
+        retriever=retriever,
+        return_source_documents=True,
+        return_generated_question=True,
+    )
+    return qa
+class DocChat(param.Parameterized):
+    """Chat session over a set of documents.
+
+    Holds the active LLM, the retrieval chain built by ``load_db``, the chat
+    history and the results of the most recent query.
+    """
+    chat_history = param.List([])
+    answer = param.String("")
+    db_query = param.String("")
+    db_response = param.List([])
+    llm = llm_api[0]
+    k_value = param.Integer(3)
+
+    def __init__(self, **params):
+        """Create the session and build the initial chain from the demo document."""
+        super(DocChat, self).__init__(**params)
+        # Stored as a list, the same shape call_load_db stores later.
+        self.loaded_file = ["demo_docs/demo.txt"]
+        self.qa = load_db(self.loaded_file, "stuff", self.k_value, self.llm)
+
+    def call_load_db(self, path_file, k):
+        """Rebuild the chain from ``path_file`` and reset the chat history.
+
+        Args:
+            path_file: List of file paths; the first one must exist on disk.
+            k: Value forwarded to ``load_db`` as its ``k`` argument.
+
+        Returns:
+            A status message describing the outcome.
+        """
+        # Covers both an empty selection and a path that does not exist.
+        if not path_file or not os.path.exists(path_file[0]):
+            return "No file loaded"
+        try:
+            self.qa = load_db(path_file, "stuff", k, self.llm)
+        except Exception:
+            return "No valid file"
+        self.loaded_file = path_file
+        self.clr_history()
+        return f"New DB created | Loaded File: {self.loaded_file}"
+
+    # chat
+    def convchain(self, query, k_max):
+        """Answer ``query`` with the current chain and record the exchange.
+
+        The chain is rebuilt first when ``k_max`` differs from the stored
+        ``k_value``.
+
+        Returns:
+            The answer text produced by the chain.
+        """
+        if k_max != self.k_value:
+            print("Maximum querys changed, reloading DB")
+            self.qa = load_db(self.loaded_file, "stuff", k_max, self.llm)
+            self.k_value = k_max
+        result = self.qa({"question": query, "chat_history": self.chat_history})
+        self.chat_history.append((query, result["answer"]))
+        self.db_query = result["generated_question"]
+        self.db_response = result["source_documents"]
+        self.answer = result["answer"]
+        return self.answer
+
+    def change_llm(self, repo_, file_, max_tokens=16, temperature=0.2, top_p=0.95, top_k=50, repeat_penalty=1.2, k=3):
+        """Download a model file and switch the session to a LlamaCpp LLM.
+
+        Args:
+            repo_: Repository id passed to ``hf_hub_download``.
+            file_: File name inside that repository.
+            max_tokens, temperature, top_p, top_k, repeat_penalty: Forwarded
+                to ``LlamaCpp``.
+            k: Value forwarded to ``load_db``; stored as ``k_value`` on success.
+
+        Returns:
+            ``"Loaded <file_>"`` on success, ``"No valid model"`` on any failure.
+        """
+        llm_kwargs = dict(
+            n_ctx=1000,
+            max_tokens=max_tokens,
+            verbose=False,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            repeat_penalty=repeat_penalty,
+        )
+        # Only the batch size and GPU offload differ between the two setups.
+        if torch.cuda.is_available():
+            llm_kwargs.update(n_batch=512, n_gpu_layers=35)
+        else:
+            llm_kwargs.update(n_batch=8)
+        try:
+            model_path = hf_hub_download(repo_id=repo_, filename=file_)
+            new_llm = LlamaCpp(model_path=model_path, **llm_kwargs)
+            new_qa = load_db(self.loaded_file, "stuff", k, new_llm)
+        except Exception:
+            return "No valid model"
+        # Commit only after every step succeeded, so a failure cannot leave
+        # the new LLM paired with the old chain.
+        self.llm = new_llm
+        self.qa = new_qa
+        self.k_value = k
+        return f"Loaded {file_}"
+
+    def default_falcon_model(self):
+        """Switch back to the module-level default LLM and rebuild the chain."""
+        self.llm = llm_api[0]
+        self.qa = load_db(self.loaded_file, "stuff", self.k_value, self.llm)
+        return "Loaded model Falcon 7B-instruct"
+
+    # The parameter name previously carried a trailing space ('db_query ').
+    @param.depends('db_query', )
+    def get_lquest(self):
+        """Return the last generated question, or print a notice and return None."""
+        if not self.db_query:
+            print("Last question to DB: no DB accesses so far")
+            return None
+        return self.db_query
+
+    @param.depends('db_response', )
+    def get_sources(self):
+        """Return the elements of every entry in ``db_response`` as one flat list.
+
+        Each entry is iterated and its elements are appended in order. An
+        empty list is returned when there is no response yet.
+        """
+        if not self.db_response:
+            return []
+        return [element for doc in self.db_response for element in doc]
+
+    @param.depends('convchain', 'clr_history')
+    def get_chats(self):
+        """Return a copy of the chat history, or ``"No History Yet"`` when empty."""
+        if not self.chat_history:
+            return "No History Yet"
+        return list(self.chat_history)
+
+    def clr_history(self, count=0):
+        """Empty the chat history and return a confirmation message.
+
+        ``count`` is accepted but not used.
+        """
+        self.chat_history = []
+        return "HISTORY CLEARED"

demo_docs/demo.txt ADDED Viewed

	@@ -0,0 +1,25 @@

+Title: Moon's Legacy - Three Perspectives through Time
+1. The Ancient Astronomer: Thoth, 3000 BCE
+In the ancient land of Egypt, Thoth, an esteemed astronomer and scribe, gazes at the night sky filled with stars. Among them, the mysterious and radiant moon captivates his attention. Thoth believes that the moon is a celestial deity, guiding the seasons and tides, influencing the lives of mortals below. He diligently records his observations and wisdom in hieroglyphs, attributing mystical qualities to the moon, believing it to be a link between the earthly and divine realms.
+Thoth's insights pass through generations, shaping early lunar beliefs and lunar calendars. The lunar phases become symbols of rebirth and renewal in various ancient cultures, weaving a spiritual connection between humans and the moon. As the centuries pass, Thoth's legacy lives on, influencing astronomical studies and cultural practices.
+2. The Space Explorer: Dr. Maria Rodriguez, 1969 CE
+Fast forward to the 20th century, Dr. Maria Rodriguez is a brilliant astrophysicist working at NASA during the Apollo era. It is July 20, 1969, and Dr. Rodriguez joins millions around the world as they anxiously watch the televised images of the first manned moon landing. As Neil Armstrong takes that historic step, Maria feels an overwhelming mix of emotions, knowing she is witnessing a turning point in human history.
+Inspired by the Apollo missions, Dr. Rodriguez dedicates her life to lunar research, studying the moon's geology, composition, and its impact on Earth. With the advancements in technology, she becomes a pioneer in space exploration, leading missions to establish lunar bases, conduct experiments, and search for resources to support humanity's expansion beyond Earth.
+Maria's contributions not only push the boundaries of scientific knowledge but also ignite dreams of colonizing the moon and beyond, leaving an indelible mark on humanity's understanding of space and our place within it.
+3. The Lunar Colonist: Chang Min-Joon, 2045 CE
+In the mid-21st century, the vision of lunar colonization becomes a reality. Chang Min-Joon, a resourceful engineer, is among the first settlers on the moon. Born on Earth, he now finds himself in awe of the barren lunar landscape, Earth rising majestically on the horizon. As part of the lunar colony, Chang Min-Joon works diligently to establish sustainable habitats, tapping into the moon's resources for energy, water, and shelter.
+Over time, the lunar colony grows, becoming a melting pot of cultures and scientific pursuits. Min-Joon and his fellow colonists encounter challenges, adapt to life in a low-gravity environment, and develop a unique lunar culture. Their existence exemplifies humanity's ability to thrive beyond our home planet.
+As the moon becomes a stepping stone for future interplanetary missions, Chang Min-Joon envisions a future where the moon is a crucial launch pad for journeys to Mars and beyond. He dreams of humanity spreading throughout the solar system, propelled by the legacy of ancient beliefs, scientific exploration, and the determination of countless individuals who contributed to our understanding and relationship with the moon.
+Together, the stories of Thoth, Dr. Maria Rodriguez, and Chang Min-Joon embody the multifaceted history of the moon, from a celestial deity to a stepping stone for humanity's interplanetary journey, leaving an enduring legacy for generations to come.

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+transformers
+numpy
+torch
+pypdf
+langchain
+langchain[docarray]
+tiktoken
+sentence_transformers
+chromadb
+huggingface_hub
+unstructured[local-inference]
+gradio