xangma committed on
Commit 0f7b25d
1 Parent(s): 80b4f00
Files changed (5):
  1. .gitignore +1 -1
  2. app.py +115 -66
  3. chain.py +14 -3
  4. ingest.py +147 -78
  5. requirements.txt +2 -1
.gitignore CHANGED
@@ -4,4 +4,4 @@ downloaded/*
 __pycache__/*
 launch.json
 .DS_Store
-devcode.py
+*devcode*
app.py CHANGED
@@ -1,9 +1,11 @@
 # chat-pykg/app.py
 import datetime
+import logging
 import os
 import random
 import shutil
 import string
+import sys
 
 import chromadb
 import gradio as gr
@@ -13,10 +15,26 @@ from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
 from langchain.vectorstores import Chroma
 
 from chain import get_new_chain1
-from ingest import ingest_docs
-
-# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+from ingest import embedding_chooser, ingest_docs
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+
+class LogTextboxHandler(logging.StreamHandler):
+    def __init__(self, textbox):
+        super().__init__()
+        self.textbox = textbox
+
+    def emit(self, record):
+        log_entry = self.format(record)
+        self.textbox.value += f"{log_entry}\n"
+
+def toggle_log_textbox(log_textbox_state):
+    toggle_visibility = not log_textbox_state
+    log_textbox_state = not log_textbox_state
+    return log_textbox_state, gr.update(visible=toggle_visibility)
+
+def update_textbox(full_log):
+    return gr.update(value=full_log)
 
 def randomword(length):
     letters = string.ascii_lowercase
@@ -25,23 +43,27 @@ def randomword(length):
 def change_tab():
     return gr.Tabs.update(selected=0)
 
-def merge_collections(collection_load_names, vs_state):
+def merge_collections(collection_load_names, vs_state, embedding_radio):
+    if type(embedding_radio) == gr.Radio:
+        embedding_radio = embedding_radio.value
+    persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
+    embedding_function = embedding_chooser(embedding_radio)
     merged_documents = []
     merged_embeddings = []
     for collection_name in collection_load_names:
         chroma_obj_get = chromadb.Client(Settings(
             chroma_db_impl="duckdb+parquet",
-            persist_directory=".persisted_data",
+            persist_directory=persist_directory,
             anonymized_telemetry = True
         ))
         if collection_name == '':
             continue
-        collection_obj = chroma_obj_get.get_collection(collection_name, embedding_function=HuggingFaceEmbeddings())
+        collection_obj = chroma_obj_get.get_collection(collection_name, embedding_function=embedding_function)
         collection = collection_obj.get(include=["metadatas", "documents", "embeddings"])
         for i in range(len(collection['documents'])):
             merged_documents.append(Document(page_content=collection['documents'][i], metadata = collection['metadatas'][i]))
             merged_embeddings.append(collection['embeddings'][i])
-    merged_vectorstore = Chroma(collection_name="temp", embedding_function=HuggingFaceEmbeddings())
+    merged_vectorstore = Chroma(collection_name="temp", embedding_function=embedding_function)
     merged_vectorstore.add_documents(documents=merged_documents, embeddings=merged_embeddings)
     return merged_vectorstore
 
@@ -64,28 +86,38 @@ def set_chain_up(openai_api_key, model_selector, k_textbox, max_tokens_textbox,
     else:
         return agent
 
-def delete_collection(all_collections_state, collections_viewer):
+def delete_collection(all_collections_state, collections_viewer, embedding_radio):
+    if type(embedding_radio) == gr.Radio:
+        embedding_radio = embedding_radio.value
+    persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
     client = chromadb.Client(Settings(
         chroma_db_impl="duckdb+parquet",
-        persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
+        persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
     ))
     for collection in collections_viewer:
         try:
             client.delete_collection(collection)
             all_collections_state.remove(collection)
             collections_viewer.remove(collection)
-        except:
-            continue
+        except Exception as e:
+            logging.error(e)
+
     return all_collections_state, collections_viewer
 
-def delete_all_collections(all_collections_state):
-    shutil.rmtree(".persisted_data")
+def delete_all_collections(all_collections_state, embedding_radio):
+    if type(embedding_radio) == gr.Radio:
+        embedding_radio = embedding_radio.value
+    persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
+    shutil.rmtree(persist_directory)
    return []
 
-def list_collections(all_collections_state):
+def list_collections(all_collections_state, embedding_radio):
+    if type(embedding_radio) == gr.Radio:
+        embedding_radio = embedding_radio.value
+    persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
     client = chromadb.Client(Settings(
         chroma_db_impl="duckdb+parquet",
-        persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
+        persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
     ))
     collection_names = [[c.name][0] for c in client.list_collections()]
     return collection_names
@@ -94,9 +126,12 @@ def update_checkboxgroup(all_collections_state):
     new_options = [i for i in all_collections_state]
     return gr.CheckboxGroup.update(choices=new_options)
 
-def destroy_agent(agent):
-    agent = None
-    return agent
+def update_log_textbox(full_log):
+    return gr.Textbox.update(value=full_log)
+
+def destroy_state(state):
+    state = None
+    return state
 
 def clear_chat(chatbot, history):
     return [], []
@@ -110,12 +145,6 @@ def chat(inp, history, agent):
     if agent == 'no_vectorstore':
         history.append((inp, "Please ingest some package docs to use"))
         return history, history
-    if agent == 'all_collections' and inp != []:
-        history.append(("", f"Current vectorstores: {inp}"))
-        return history, history
-    if agent == 'all_vs_deleted':
-        history.append((inp, "All vectorstores deleted"))
-        return history, history
     else:
         print("\n==== date/time: " + str(datetime.datetime.now()) + " ====")
         print("inp: " + inp)
@@ -126,10 +155,10 @@ def chat(inp, history, agent):
     print(history)
     return history, history
 
-block = gr.Blocks(css=".gradio-container {background-color: system;}")
+block = gr.Blocks(title = "chat-pykg", analytics_enabled = False, css=".gradio-container {background-color: system;}")
 
 with block:
-    gr.Markdown("<h3><center>chat-pykg</center></h3>")
+    gr.Markdown("<h1><center>chat-pykg</center></h1>")
     with gr.Tabs() as tabs:
         with gr.TabItem("Chat", id=0):
             with gr.Row():
@@ -139,22 +168,26 @@ with block:
                     lines=1,
                     type="password",
                 )
-                model_selector = gr.Dropdown(["gpt-3.5-turbo", "gpt-4", "other"], label="Model", show_label=True)
-                model_selector.value = "gpt-3.5-turbo"
+                model_selector = gr.Dropdown(
+                    choices=["gpt-3.5-turbo", "gpt-4", "other"],
+                    label="Model",
+                    show_label=True,
+                    value = "gpt-3.5-turbo"
+                )
                 k_textbox = gr.Textbox(
                     placeholder="k: Number of search results to consider",
                     label="Search Results k:",
                     show_label=True,
                     lines=1,
+                    value="20",
                 )
-                k_textbox.value = "20"
                 max_tokens_textbox = gr.Textbox(
                     placeholder="max_tokens: Maximum number of tokens to generate",
                     label="max_tokens",
                     show_label=True,
                     lines=1,
+                    value="1000",
                 )
-                max_tokens_textbox.value="1000"
            chatbot = gr.Chatbot()
            with gr.Row():
                clear_btn = gr.Button("Clear Chat", variant="secondary").style(full_width=False)
@@ -167,7 +200,7 @@ with block:
     gr.Examples(
         examples=[
             "What does this code do?",
-            "Where is this specific method in the source code and why is it broken?"
+            "I want to change the chat-pykg app to have a log viewer, where the user can see what python is doing in the background. How could I do that?",
         ],
         inputs=message,
     )
@@ -178,35 +211,41 @@ with block:
     The source code is split/broken down into many document objects using langchain's pythoncodetextsplitter, which apparently tries to keep whole functions etc. together. This means that each file in the source code is split into many smaller documents, and the k value is the number of documents to consider when searching for the most similar documents to the question. With gpt-3.5-turbo, k=10 seems to work well, but with gpt-4, k=20 seems to work better.
     The model's memory is set to 5 messages, but I haven't tested with gpt-3.5-turbo yet to see if it works well. It seems to work well with gpt-4."""
     )
-    with gr.TabItem("Collections manager", id=1):
+    with gr.TabItem("Repository Selector/Manager", id=1):
         with gr.Row():
-            with gr.Column(scale=2):
-                all_collections_to_get = gr.List(headers=['New Collections to make'],row_count=3, label='Collections_to_get', show_label=True, interactive=True, max_cols=1, max_rows=3)
-                make_collections_button = gr.Button(value="Make new collection(s)", variant="secondary").style(full_width=False)
-                with gr.Row():
-                    chunk_size_textbox = gr.Textbox(
-                        placeholder="Chunk size",
-                        label="Chunk size",
-                        show_label=True,
-                        lines=1,
-                    )
-                    chunk_overlap_textbox = gr.Textbox(
-                        placeholder="Chunk overlap",
-                        label="Chunk overlap",
-                        show_label=True,
-                        lines=1,
-                    )
-                chunk_size_textbox.value = "1000"
-                chunk_overlap_textbox.value = "0"
-                with gr.Row():
-                    gr.HTML('<center>See the <a href=https://python.langchain.com/en/latest/reference/modules/text_splitter.html>Langchain textsplitter docs</a></center>')
-            with gr.Column(scale=2):
-                collections_viewer = gr.CheckboxGroup(choices=[], label='Collections_viewer', show_label=True)
-            with gr.Column(scale=1):
-                load_collections_button = gr.Button(value="Load collection(s) to chat!", variant="secondary").style(full_width=False)
-                get_all_collection_names_button = gr.Button(value="List all saved collections", variant="secondary").style(full_width=False)
-                delete_collections_button = gr.Button(value="Delete selected saved collections", variant="secondary").style(full_width=False)
-                delete_all_collections_button = gr.Button(value="Delete all saved collections", variant="secondary").style(full_width=False)
+            collections_viewer = gr.CheckboxGroup(choices=[], label='Repository Viewer', show_label=True)
+        with gr.Row():
+            load_collections_button = gr.Button(value="Load respositories to chat!", variant="secondary")#.style(full_width=False)
+            get_all_collection_names_button = gr.Button(value="List all saved repositories", variant="secondary")#.style(full_width=False)
+            delete_collections_button = gr.Button(value="Delete selected saved repositories", variant="secondary")#.style(full_width=False)
+            delete_all_collections_button = gr.Button(value="Delete all saved repositories", variant="secondary")#.style(full_width=False)
+    with gr.TabItem("Get New Repositories", id=2):
+        with gr.Row():
+            all_collections_to_get = gr.List(headers=['Repository URL', 'Folders'], row_count=3, col_count=2, label='Repositories to get', show_label=True, interactive=True, max_cols=2, max_rows=3)
+            make_collections_button = gr.Button(value="Get new repositories", variant="secondary").style(full_width=False)
+        with gr.Row():
+            chunk_size_textbox = gr.Textbox(
+                placeholder="Chunk size",
+                label="Chunk size",
+                show_label=True,
+                lines=1,
+                value="1000"
+            )
+            chunk_overlap_textbox = gr.Textbox(
+                placeholder="Chunk overlap",
+                label="Chunk overlap",
+                show_label=True,
+                lines=1,
+                value="0"
+            )
+            embedding_radio = gr.Radio(
+                choices = ['Sentence Transformers', 'OpenAI'],
+                label="Embedding Options",
+                show_label=True,
+                value='Sentence Transformers'
+            )
+        with gr.Row():
+            gr.HTML('<center>See the <a href=https://python.langchain.com/en/latest/reference/modules/text_splitter.html>Langchain textsplitter docs</a></center>')
     gr.HTML(
         "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
     )
@@ -216,25 +255,35 @@ with block:
     vs_state = gr.State()
     all_collections_state = gr.State()
     chat_state = gr.State()
+    debug_state = gr.State()
+    debug_state.value = False
 
     submit.click(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state]).then(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
     message.submit(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state]).then(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
 
-    load_collections_button.click(merge_collections, inputs=[collections_viewer, vs_state], outputs=[vs_state])#.then(change_tab, None, tabs) #.then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state])
-    make_collections_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get, chunk_size_textbox, chunk_overlap_textbox], outputs=[all_collections_state], show_progress=True).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
-    delete_collections_button.click(delete_collection, inputs=[all_collections_state, collections_viewer], outputs=[all_collections_state, collections_viewer]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
-    delete_all_collections_button.click(delete_all_collections, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
-    get_all_collection_names_button.click(list_collections, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+    load_collections_button.click(merge_collections, inputs=[collections_viewer, vs_state, embedding_radio], outputs=[vs_state])#.then(change_tab, None, tabs) #.then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state])
+    make_collections_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get, chunk_size_textbox, chunk_overlap_textbox, embedding_radio, debug_state], outputs=[all_collections_state, all_collections_to_get], show_progress=True).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+    delete_collections_button.click(delete_collection, inputs=[all_collections_state, collections_viewer, embedding_radio], outputs=[all_collections_state, collections_viewer]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+    delete_all_collections_button.click(delete_all_collections, inputs=[all_collections_state, embedding_radio], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+    get_all_collection_names_button.click(list_collections, inputs=[all_collections_state, embedding_radio], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
     clear_btn.click(clear_chat, inputs = [chatbot, history_state], outputs = [chatbot, history_state])
     # Whenever chain parameters change, destroy the agent.
-    input_list = [openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox]
+    input_list = [openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, embedding_radio]
     output_list = [agent_state]
     for input_item in input_list:
         input_item.change(
-            destroy_agent,
+            destroy_state,
             inputs=output_list,
             outputs=output_list,
         )
-    all_collections_state.value = list_collections(all_collections_state)
+    all_collections_state.value = list_collections(all_collections_state, embedding_radio)
     block.load(update_checkboxgroup, inputs = all_collections_state, outputs = collections_viewer)
+    log_textbox_handler = LogTextboxHandler(gr.TextArea(interactive=False, placeholder="Logs will appear here...", visible=False))
+    log_textbox = log_textbox_handler.textbox
+    logging.getLogger().addHandler(log_textbox_handler)
+    log_textbox_visibility_state = gr.State()
+    log_textbox_visibility_state.value = False
+    log_toggle_button = gr.Button("Toggle Log", variant="secondary")
+    log_toggle_button.click(toggle_log_textbox, inputs=[log_textbox_visibility_state], outputs=[log_textbox_visibility_state,log_textbox])
+    block.queue(concurrency_count=40)
 block.launch(debug=True)
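Note: the new `LogTextboxHandler` is the core of the log-viewer feature: a custom `logging` handler whose `emit` appends each formatted record to the Gradio textbox's `value`. A minimal self-contained sketch of the same pattern (the names below are illustrative, not from the commit); one caveat is that mutating a component's `value` server-side may not push a live update to already-connected Gradio clients, which would explain the explicit toggle button:

```python
import logging

class BufferHandler(logging.Handler):
    """Collect formatted log records in a string buffer,
    as LogTextboxHandler does with textbox.value."""
    def __init__(self):
        super().__init__()
        self.buffer = ""

    def emit(self, record):
        self.buffer += self.format(record) + "\n"

handler = BufferHandler()
handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
logging.getLogger().addHandler(handler)
logging.getLogger().setLevel(logging.INFO)

logging.info("ingesting repository...")
print(handler.buffer)  # a UI layer can render this buffer on demand
```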
chain.py CHANGED
@@ -1,5 +1,6 @@
 # chat-pykg/chain.py
-
+from typing import Any, Dict, Iterable, List, Optional, Type, TypeVar
+from pydantic import Extra, Field, root_validator
 from langchain.chains.base import Chain
 from langchain import HuggingFaceHub
 from langchain.chains.question_answering import load_qa_chain
@@ -10,12 +11,21 @@ from langchain.chains.llm import LLMChain
 from langchain.callbacks.base import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT
+from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
+from langchain.chains.llm import LLMChain
+from langchain.schema import BaseLanguageModel, BaseRetriever, Document
+from langchain.prompts.prompt import PromptTemplate
+
 
 # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
 
 def get_new_chain1(vectorstore, model_selector, k_textbox, max_tokens_textbox) -> Chain:
 
+    # def _get_docs(self, question: str, inputs: Dict[str, Any]) -> List[Document]:
+    #     docs = self.retriever.vectorstore._collection.query(question, n_results=self.retriever.search_kwargs["k"], where = {"source":{"$contains":"search_string"}}, where_document = {"$contains":"search_string"})
+    #     return self._reduce_tokens_below_limit(docs)
+
     template = """You are called chat-pykg and are an AI assistant coded in python using langchain and gradio. You are very helpful for answering questions about various open source libraries.
     You are given the following extracted parts of code and a question. Provide a conversational answer to the question.
     Do NOT make up any hyperlinks that are not in the code.
@@ -34,8 +44,8 @@ def get_new_chain1(vectorstore, model_selector, k_textbox, max_tokens_textbox) -> Chain:
         llm = HuggingFaceHub(repo_id="chavinlo/gpt4-x-alpaca")#, model_kwargs={"temperature":0, "max_length":64})
         doc_chain_llm = HuggingFaceHub(repo_id="chavinlo/gpt4-x-alpaca")
     question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
-    doc_chain = load_qa_chain(doc_chain_llm, chain_type="stuff", prompt=QA_PROMPT)
-
+    doc_chain = load_qa_chain(doc_chain_llm, chain_type="stuff", prompt=QA_PROMPT)#, document_prompt = PromptTemplate(input_variables=["source", "page_content"], template="{source}\n{page_content}"))
+
     # memory = ConversationKGMemory(llm=llm, input_key="question", output_key="answer")
     memory = ConversationBufferWindowMemory(input_key="question", output_key="answer", k=5)
     retriever = vectorstore.as_retriever(search_type="similarity")
@@ -45,5 +55,6 @@ def get_new_chain1(vectorstore, model_selector, k_textbox, max_tokens_textbox) -> Chain:
     retriever.search_kwargs = {"k": 10}
     qa = ConversationalRetrievalChain(
         retriever=retriever, memory=memory, combine_docs_chain=doc_chain, question_generator=question_generator)
+    # qa._get_docs = _get_docs.__get__(qa, ConversationalRetrievalChain)
 
     return qa
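Note: the commented-out `qa._get_docs = _get_docs.__get__(qa, ConversationalRetrievalChain)` line sketches the same per-instance monkeypatching trick that ingest.py actually applies to `_merge_splits`: `function.__get__(instance, Class)` binds a plain function as a method of one object without touching the class. A toy demonstration in plain Python (all names here are illustrative):

```python
class Splitter:
    def merge(self, parts):
        return " ".join(parts)

def patched_merge(self, parts):
    # Replacement behaviour; still receives `self` like a normal method.
    return " | ".join(parts)

s = Splitter()
s.merge = patched_merge.__get__(s, Splitter)  # bind to this one instance
print(s.merge(["a", "b"]))            # a | b
print(Splitter().merge(["a", "b"]))   # a b -- other instances are unaffected
```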
ingest.py CHANGED
@@ -1,8 +1,9 @@
 # chat-pykg/ingest.py
 import tempfile
+import gradio as gr
 from langchain.document_loaders import SitemapLoader, ReadTheDocsLoader, TextLoader
 from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter, PythonCodeTextSplitter, MarkdownTextSplitter
+from langchain.text_splitter import RecursiveCharacterTextSplitter, PythonCodeTextSplitter, MarkdownTextSplitter, TextSplitter
 from langchain.vectorstores.faiss import FAISS
 import os
 from langchain.vectorstores import Chroma
@@ -10,8 +11,12 @@ import shutil
 from pathlib import Path
 import subprocess
 import chromadb
-from chromadb.config import Settings
-import chromadb.utils.embedding_functions as ef
+import magic
+from typing import Any, Dict, Iterable, List, Optional, Type, TypeVar
+from pydantic import Extra, Field, root_validator
+import logging
+logger = logging.getLogger()
+from langchain.docstore.document import Document
 
 # class CachedChroma(Chroma, ABC):
 #     """
@@ -65,6 +70,62 @@ import chromadb.utils.embedding_functions as ef
 #         )
 #         raise ValueError("Either documents or collection_name must be specified.")
 
+def embedding_chooser(embedding_radio):
+    if embedding_radio == "Sentence Transformers":
+        embedding_function = HuggingFaceEmbeddings()
+    elif embedding_radio == "OpenAI":
+        embedding_function = OpenAIEmbeddings()
+    else:
+        embedding_function = HuggingFaceEmbeddings()
+    return embedding_function
+
+# Monkeypatch pending PR
+def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
+    # We now want to combine these smaller pieces into medium size
+    # chunks to send to the LLM.
+    separator_len = self._length_function(separator)
+
+    docs = []
+    current_doc: List[str] = []
+    total = 0
+    for index, d in enumerate(splits):
+        _len = self._length_function(d)
+        if (
+            total + _len + (separator_len if len(current_doc) > 0 else 0)
+            > self._chunk_size
+        ):
+            if total > self._chunk_size:
+                logger.warning(
+                    f"Created a chunk of size {total}, "
+                    f"which is longer than the specified {self._chunk_size}"
+                )
+            if len(current_doc) > 0:
+                doc = self._join_docs(current_doc, separator)
+                if doc is not None:
+                    docs.append(doc)
+                # Keep on popping if:
+                # - we have a larger chunk than in the chunk overlap
+                # - or if we still have any chunks and the length is long
+                while total > self._chunk_overlap or (
+                    total + _len + (separator_len if len(current_doc) > 0 else 0)
+                    > self._chunk_size
+                    and total > 0
+                ):
+                    total -= self._length_function(current_doc[0]) + (
+                        separator_len if len(current_doc) > 1 else 0
+                    )
+                    current_doc = current_doc[1:]
+
+        if index > 0:
+            current_doc.append(separator + d)
+        else:
+            current_doc.append(d)
+        total += _len + (separator_len if len(current_doc) > 1 else 0)
+    doc = self._join_docs(current_doc, separator)
+    if doc is not None:
+        docs.append(doc)
+    return docs
+
 def get_text(content):
     relevant_part = content.find("div", {"class": "markdown"})
     if relevant_part is not None:
@@ -72,83 +133,96 @@ def get_text(content):
     else:
         return ""
 
-def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap):
-    """Get documents from web pages."""
+def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap, embedding_radio, debug=False):
+    cleared_list = urls.copy()
+    def sanitize_folder_name(folder_name):
+        if folder_name != '':
+            folder_name = folder_name.strip().rstrip('/')
+        else:
+            folder_name = '.' # current directory
+        return folder_name
+
+    def is_hidden(path):
+        return os.path.basename(path).startswith('.')
+
+    embedding_function = embedding_chooser(embedding_radio)
     all_docs = []
-    folders=[]
-    documents = []
     shutil.rmtree('downloaded/', ignore_errors=True)
     known_exts = ["py", "md"]
+    # Initialize text splitters
     py_splitter = PythonCodeTextSplitter(chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap))
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap))
     md_splitter = MarkdownTextSplitter(chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap))
-    for url in urls:
+    py_splitter._merge_splits = _merge_splits.__get__(py_splitter, TextSplitter)
+    # Process input URLs
+    urls = [[url.strip(), [sanitize_folder_name(folder) for folder in url_folders.split(',')]] for url, url_folders in urls]
+    for j in range(len(urls)):
+        orgrepo = urls[j][0]
+        repo_folders = urls[j][1]
+        if orgrepo == '':
+            continue
+        if orgrepo.replace('/','-') in all_collections_state:
+            logging.info(f"Skipping {orgrepo} as it is already in the database")
+            continue
+        documents = []
+        paths = []
         paths_by_ext = {}
         docs_by_ext = {}
         for ext in known_exts + ["other"]:
             docs_by_ext[ext] = []
             paths_by_ext[ext] = []
-        url = url[0]
-        if url == '':
-            continue
-        if "." in url:
-            if len(url) > 1:
-                folder = url.split('.')[1]
-            else:
-                folder = '.'
+
+        if orgrepo[0] == '/' or orgrepo[0] == '.':
+            # Ingest local folder
+            local_repo_path = sanitize_folder_name(orgrepo[1:])
         else:
-            destination = Path(os.path.join('downloaded',url))
-            destination.mkdir(exist_ok=True, parents=True)
-            destination = destination.as_posix()
-            if url[0] == '/':
-                url = url[1:]
-            org = url.split('/')[0]
-            repo = url.split('/')[1]
+            # Ingest remote git repo
+            org = orgrepo.split('/')[0]
+            repo = orgrepo.split('/')[1]
             repo_url = f"https://github.com/{org}/{repo}.git"
-            # join all strings after 2nd slash
-            folder = '/'.join(url.split('/')[2:])
-            if folder[-1] == '/':
-                folder = folder[:-1]
-            if folder:
-                with tempfile.TemporaryDirectory() as temp_dir:
-                    temp_path = Path(temp_dir)
-
-                    # Initialize the Git repository
-                    subprocess.run(["git", "init"], cwd=temp_path)
-
-                    # Add the remote repository
-                    subprocess.run(["git", "remote", "add", "-f", "origin", repo_url], cwd=temp_path)
-
-                    # Enable sparse-checkout
-                    subprocess.run(["git", "config", "core.sparseCheckout", "true"], cwd=temp_path)
+            local_repo_path = os.path.join('.downloaded', orgrepo) if debug else tempfile.mkdtemp()
 
-                    # Specify the folder to checkout
-                    with open(temp_path / ".git" / "info" / "sparse-checkout", "w") as f:
-                        f.write(f"{folder}/\n")
+            # Initialize the Git repository
+            subprocess.run(["git", "init"], cwd=local_repo_path)
+            # Add the remote repository
+            subprocess.run(["git", "remote", "add", "-f", "origin", repo_url], cwd=local_repo_path)
+            # Enable sparse-checkout
+            subprocess.run(["git", "config", "core.sparseCheckout", "true"], cwd=local_repo_path)
+            # Specify the folder to checkout
+            cmd = ["git", "sparse-checkout", "set"] + [i for i in repo_folders]
+            subprocess.run(cmd, cwd=local_repo_path)
+            # Check if branch is called main or master
 
-                    # Checkout the desired branch
-                    res = subprocess.run(["git", "checkout", 'main'], cwd=temp_path)
-                    if res.returncode == 1:
-                        res = subprocess.run(["git", "checkout", "master"], cwd=temp_path)
-                res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
-            folder = destination
-        local_repo_path_1 = folder
-        if local_repo_path_1 == '.':
-            local_repo_path_1 = os.getcwd()
-        for root, dirs, files in os.walk(local_repo_path_1):
+            # Checkout the desired branch
+            res = subprocess.run(["git", "checkout", 'main'], cwd=local_repo_path)
+            if res.returncode == 1:
+                res = subprocess.run(["git", "checkout", "master"], cwd=local_repo_path)
+            #res = subprocess.run(["cp", "-r", (Path(local_repo_path) / repo_folders[i]).as_posix(), '/'.join(destination.split('/')[:-1])])#
+        # Iterate through files and process them
+        if local_repo_path == '.':
+            orgrepo='chat-pykg'
+        for root, dirs, files in os.walk(local_repo_path):
+            dirs[:] = [d for d in dirs if not is_hidden(d)] # Ignore hidden directories
             for file in files:
-                file_path = os.path.join(root, file)
-                rel_file_path = os.path.relpath(file_path, local_repo_path_1)
-                ext = rel_file_path.split('.')[-1]
-                if rel_file_path.startswith('.'):
+                if is_hidden(file):
                     continue
+                file_path = os.path.join(root, file)
+                rel_file_path = os.path.relpath(file_path, local_repo_path)
                 try:
-                    if paths_by_ext.get(rel_file_path.split('.')[-1]) is None:
-                        paths_by_ext["other"].append(rel_file_path)
-                        docs_by_ext["other"].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
+                    if '.' not in rel_file_path:
+                        inferred_filetype = magic.from_file(file_path, mime=True)
+                        if "python" in inferred_filetype or "text/plain" in inferred_filetype:
+                            ext = "py"
+                        else:
+                            ext = "other"
                     else:
-                        paths_by_ext[ext].append(rel_file_path)
-                        docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
+                        ext = rel_file_path.split('.')[-1]
+                    if docs_by_ext.get(ext) is None:
+                        ext = "other"
+                    doc = TextLoader(os.path.join(local_repo_path, rel_file_path)).load()[0]
+                    doc.metadata["source"] = os.path.join(orgrepo, rel_file_path)
+                    docs_by_ext[ext].append(doc)
+                    paths_by_ext[ext].append(rel_file_path)
                 except Exception as e:
                     continue
         for ext in docs_by_ext.keys():
@@ -157,25 +231,20 @@ def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap):
         if ext == "md":
             documents += md_splitter.split_documents(docs_by_ext[ext])
         # else:
-        #     documents += text_splitter.split_documents(docs_by_ext[ext]
+        #         documents += text_splitter.split_documents(docs_by_ext[ext]
         all_docs += documents
-        if 'downloaded/' in folder:
-            folder = '-'.join(folder.split('/')[1:])
-        if folder == '.':
-            folder = 'chat-pykg'
-        collection = Chroma.from_documents(documents=documents, collection_name=folder, embedding=HuggingFaceEmbeddings(), persist_directory=".persisted_data")
+        # For each document, add the metadata to the page_content
+        for doc in documents:
+            doc.page_content = f'# source:{doc.metadata["source"]}\n{doc.page_content}'
+        if type(embedding_radio) == gr.Radio:
+            embedding_radio = embedding_radio.value
+        persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
+        collection_name = orgrepo.replace('/','-')
+        collection = Chroma.from_documents(documents=documents, collection_name=collection_name, embedding=embedding_function, persist_directory=persist_directory)
         collection.persist()
-        all_collections_state.append(folder)
-        return all_collections_state
-    # embeddings = HuggingFaceEmbeddings()
-    # merged_vectorstore = Chroma.from_documents(persist_directory=".persisted_data", documents=documents, embedding=embeddings, collection_name='merged_collections')
-    # #vectorstore = FAISS.from_documents(documents, embeddings)
-    # # # Save vectorstore
-    # # with open("vectorstore.pkl", "wb") as f:
-    # #     pickle.dump(vectorstore. , f)
-
-    # return merged_vectorstore
-
+        all_collections_state.append(collection_name)
+        cleared_list[j][0], cleared_list[j][1] = '', ''
+    return all_collections_state, gr.update(value=cleared_list)
 
 if __name__ == "__main__":
     ingest_docs()
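Note: `ingest_docs` now drives `git sparse-checkout set` through subprocess instead of hand-writing `.git/info/sparse-checkout`. Stripped of the surrounding loop, the sequence it runs per repository looks roughly like the sketch below; the repo URL and folder are placeholders, and `check=True` is added here for clarity (the commit itself only inspects the checkout's return code):

```python
import subprocess
import tempfile

repo_url = "https://github.com/hwchase17/langchain.git"  # placeholder
folders = ["langchain"]                                   # placeholder subfolders

workdir = tempfile.mkdtemp()
subprocess.run(["git", "init"], cwd=workdir, check=True)
# -f fetches immediately; sparse-checkout then limits what lands in the working tree
subprocess.run(["git", "remote", "add", "-f", "origin", repo_url], cwd=workdir, check=True)
subprocess.run(["git", "config", "core.sparseCheckout", "true"], cwd=workdir, check=True)
subprocess.run(["git", "sparse-checkout", "set", *folders], cwd=workdir, check=True)
# Try main first, fall back to master, as the commit does.
if subprocess.run(["git", "checkout", "main"], cwd=workdir).returncode != 0:
    subprocess.run(["git", "checkout", "master"], cwd=workdir, check=True)
```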
requirements.txt CHANGED
@@ -6,4 +6,5 @@ Flask
 transformers
 gradio
 chromadb
-sentence_transformers
+sentence_transformers
+python-magic
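Note: the new `python-magic` dependency backs the MIME sniffing that ingest.py applies to extensionless files. It wraps the libmagic C library, which must be installed on the system separately from the pip package (e.g. `libmagic1` on Debian/Ubuntu). A quick sanity check:

```python
import magic

# mime=True yields strings like "text/x-python" or "text/plain",
# which is what ingest.py's substring checks rely on.
print(magic.from_file("app.py", mime=True))
```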