Spaces:

ShawnAI
/

VectorDB

Sleeping

App Files Files Community

ShawnAI committed on Jun 14, 2023

Commit

f0d1783

•

1 Parent(s): 2aba62d

Create app.py

Browse files

Files changed (1) hide show

app.py +156 -0

app.py ADDED Viewed

	@@ -0,0 +1,156 @@

+import gradio as gr
+from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings, OpenAIEmbeddings
+from langchain.vectorstores import Pinecone
+import pinecone
+import os
# Ask HuggingFace tokenizers not to use parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Pinecone connection settings; each can be overridden via the environment.
PINECONE_KEY = os.environ.get("PINECONE_KEY", "")
PINECONE_ENV = os.environ.get("PINECONE_ENV", "asia-northeast1-gcp")
PINECONE_INDEX = os.environ.get("PINECONE_INDEX", "3gpp-r16")

# Embedding model name and the name of the class used to load it.
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "hkunlp/instructor-large")
EMBEDDING_LOADER = os.environ.get("EMBEDDING_LOADER", "HuggingFaceInstructEmbeddings")
# Loader names offered in the configuration dropdown.
EMBEDDING_LIST = ["HuggingFaceInstructEmbeddings", "HuggingFaceEmbeddings"]

# Default and maximum number of text chunks requested from the vector store.
TOP_K_DEFAULT = 15
TOP_K_MAX = 30
# Default minimum similarity score a chunk needs in order to be returned.
SCORE_DEFAULT = 0.33

# Shared vector store handle; stays None until init_db() assigns it.
# (A module-level ``global`` statement is a no-op, so none is needed here.)
g_db = None
def init_db(emb_name, emb_loader, db_api_key, db_env, db_index):
    """Connect to an existing Pinecone index and store it in ``g_db``.

    Args:
        emb_name: Model name passed to the embedding class as ``model_name``.
        emb_loader: Name of the embedding class to instantiate.
        db_api_key: Pinecone API key.
        db_env: Pinecone environment name.
        db_index: Name of the existing Pinecone index to open.

    Returns:
        ``str()`` of the newly created vector store, for display in the UI.

    Raises:
        ValueError: If ``emb_loader`` is not a known embedding class name.
    """
    global g_db

    # emb_loader comes from the UI, so resolve it through an explicit
    # allow-list rather than eval(), which would execute arbitrary input.
    loaders = {
        "HuggingFaceInstructEmbeddings": HuggingFaceInstructEmbeddings,
        "HuggingFaceEmbeddings": HuggingFaceEmbeddings,
        "OpenAIEmbeddings": OpenAIEmbeddings,
    }
    try:
        loader_cls = loaders[emb_loader]
    except KeyError:
        raise ValueError(
            f"Unknown embedding loader {emb_loader!r}; "
            f"expected one of {sorted(loaders)}"
        ) from None

    embeddings = loader_cls(model_name=emb_name)
    pinecone.init(api_key=db_api_key, environment=db_env)
    g_db = Pinecone.from_existing_index(index_name=db_index,
                                        embedding=embeddings)
    return str(g_db)
def get_db():
    """Return the current value of the module-level ``g_db`` handle."""
    current = g_db
    return current
def remove_duplicates(documents, score_min):
    """Filter scored documents by score and drop repeated page content.

    Args:
        documents: Iterable of ``(doc, score)`` pairs, where ``doc`` exposes
            a ``page_content`` attribute.
        score_min: Minimum score a pair needs in order to be kept.

    Returns:
        A list of the ``doc`` objects (without scores), in input order,
        keeping only the first qualifying document for each distinct
        ``page_content``.
    """
    kept = []
    seen_texts = set()
    for entry in documents:
        doc, score = entry
        text = doc.page_content
        # Skip content already emitted, and anything scoring below the floor.
        if text in seen_texts or not (score >= score_min):
            continue
        seen_texts.add(text)
        kept.append(doc)
    return kept
def get_data(query, top_k, score):
    """Search the vector store for ``query`` and return the matching documents.

    Args:
        query: Text to search for.
        top_k: Number of nearest chunks to request from the vector store.
        score: Minimum similarity score a chunk needs in order to be returned.

    Returns:
        A list of de-duplicated documents scoring at least ``score``, or a
        message string when the database is not initialised or the query is
        empty.
    """
    # The guard must test the database handle: g_db stays None until
    # init_db() has run, and searching on None would raise AttributeError.
    if g_db is None:
        return "Please init db in configuration"
    if not query:
        return "Please enter a query"

    print("Use db: " + str(g_db))
    # Cast top_k because the value comes from a UI slider and k must be an int.
    scored_docs = g_db.similarity_search_with_score(query=query,
                                                    k=int(top_k))
    return remove_duplicates(scored_docs, score)
# Build the Gradio UI: a "Matching" tab for running queries and a
# "Configuration" tab for choosing the embedding model and Pinecone index.
with gr.Blocks(
    title="3GPP Database",
    theme="Base",
    css=""".bigbox {
    min-height:250px;
}
""",
) as demo:
    with gr.Tab("Matching"):
        # Search tuning: how many chunks to fetch and the minimum score to keep.
        with gr.Accordion("Vector similarity"):
            with gr.Row():
                with gr.Column():
                    top_k = gr.Slider(1,
                                      TOP_K_MAX,
                                      value=TOP_K_DEFAULT,
                                      step=1,
                                      label="Vector similarity top_k",
                                      interactive=True)
                with gr.Column():
                    score = gr.Slider(0.01,
                                      0.99,
                                      value=SCORE_DEFAULT,
                                      step=0.01,
                                      label="Vector similarity score",
                                      interactive=True)
        with gr.Row():
            inp = gr.Textbox(label="Input",
                             placeholder="What are you looking for?")
            out = gr.Textbox(label="Output")
        btn_run = gr.Button("Run", variant="primary")

    with gr.Tab("Configuration"):
        with gr.Row():
            # Status box whose value is supplied by get_db; the Init button's
            # result (the str() of the vector store) is also written here.
            loading = gr.Textbox(get_db, max_lines=1, show_label=False)
            btn_init = gr.Button("Init")
        with gr.Accordion("Embedding"):
            with gr.Row():
                with gr.Column():
                    # Plain text field: the value is a model repo name,
                    # not an e-mail address.
                    emb_textbox = gr.Textbox(
                        label="Embedding Model",
                        value=EMBEDDING_MODEL,
                        placeholder="Paste Your Embedding Model Repo on HuggingFace",
                        lines=1,
                        interactive=True,
                        type='text')
                with gr.Column():
                    emb_dropdown = gr.Dropdown(
                        EMBEDDING_LIST,
                        value=EMBEDDING_LOADER,
                        multiselect=False,
                        interactive=True,
                        label="Embedding Loader")
        with gr.Accordion("Pinecone Database"):
            with gr.Row():
                # The API key is a secret, so it is rendered as a password field.
                db_api_textbox = gr.Textbox(
                    label="Pinecone API Key",
                    value=PINECONE_KEY,
                    placeholder="Paste Your Pinecone API Key (xx-xx-xx-xx-xx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type='password')
            with gr.Row():
                db_env_textbox = gr.Textbox(
                    label="Pinecone Environment",
                    value=PINECONE_ENV,
                    placeholder="Paste Your Pinecone Environment (xx-xx-xx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type='text')
                db_index_textbox = gr.Textbox(
                    label="Pinecone Index",
                    value=PINECONE_INDEX,
                    placeholder="Paste Your Pinecone Index (xxxx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type='text')

    # Event wiring: "Init" (re)connects to the index, "Run" performs a search.
    btn_init.click(fn=init_db,
                   inputs=[emb_textbox, emb_dropdown, db_api_textbox,
                           db_env_textbox, db_index_textbox],
                   outputs=loading)
    btn_run.click(fn=get_data, inputs=[inp, top_k, score], outputs=out)
if __name__ == "__main__":
    # Turn on request queuing, then start the app with inbrowser enabled.
    demo.queue()
    demo.launch(inbrowser=True)