import os

import gradio as gr
import pinecone
from langchain.embeddings import (
    HuggingFaceEmbeddings,
    HuggingFaceInstructEmbeddings,
    OpenAIEmbeddings,
)
from langchain.vectorstores import Pinecone

# Turn off parallelism in the HuggingFace `tokenizers` library for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Pinecone connection settings; each can be overridden via an environment variable.
PINECONE_KEY = os.environ.get("PINECONE_KEY", "")
PINECONE_ENV = os.environ.get("PINECONE_ENV", "us-east-1")
PINECONE_INDEX = os.environ.get("PINECONE_INDEX", "3gpp-r16-hg")

# Embedding model name and the name of the class used to load it.
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "hkunlp/instructor-large")
EMBEDDING_LOADER = os.environ.get("EMBEDDING_LOADER", "HuggingFaceInstructEmbeddings")
# Loader names offered in the configuration dropdown.
EMBEDDING_LIST = ["HuggingFaceInstructEmbeddings", "HuggingFaceEmbeddings"]

# Defaults and upper bound for the similarity-search sliders.
TOP_K_DEFAULT = 15
TOP_K_MAX = 30
SCORE_DEFAULT = 0.33

# Vector-store handle shared by the UI callbacks; stays None until init_db() runs.
g_db = None


def init_db(emb_name, emb_loader, db_api_key, db_env, db_index):
    """Connect to an existing Pinecone index and store the handle in ``g_db``.

    Args:
        emb_name: Model name passed to the embedding class as ``model_name``.
        emb_loader: Name of the embedding class to instantiate; must be one
            of the embedding classes imported by this module.
        db_api_key: Pinecone API key.
        db_env: Pinecone environment name.
        db_index: Name of the existing Pinecone index to open.

    Returns:
        ``str()`` of the newly created vector-store handle.

    Raises:
        ValueError: If ``emb_loader`` is not an allowed loader name.
    """
    global g_db

    # SECURITY: emb_loader comes from the UI and used to be passed to eval(),
    # which would execute any expression it contained. Only the embedding
    # classes this module imports are accepted now.
    allowed_loaders = (
        "HuggingFaceInstructEmbeddings",
        "HuggingFaceEmbeddings",
        "OpenAIEmbeddings",
    )
    if emb_loader not in allowed_loaders:
        raise ValueError(f"Unsupported embedding loader: {emb_loader!r}")

    # Resolve the validated name to the class imported at module level.
    loader_cls = globals()[emb_loader]
    embeddings = loader_cls(model_name=emb_name)

    pinecone.init(api_key=db_api_key, environment=db_env)

    g_db = Pinecone.from_existing_index(index_name=db_index, embedding=embeddings)
    return str(g_db)


def get_db():
    """Return the module-level vector-store handle ``g_db``.

    The value is ``None`` until ``init_db`` has assigned a handle.
    """
    return g_db


def remove_duplicates(documents, score_min):
    """Filter scored documents by a minimum score and drop repeated content.

    Args:
        documents: Iterable of ``(doc, score)`` pairs; every ``doc`` must
            expose a ``page_content`` attribute.
        score_min: Inclusive lower bound a score must reach to be kept.

    Returns:
        List of the kept ``doc`` objects in input order, without scores.
        For each distinct ``page_content`` only the first document that
        reaches ``score_min`` is kept.
    """
    seen_content = set()
    unique_documents = []
    for doc, score in documents:
        # Content is recorded as "seen" only when the document is actually
        # kept, so a low-scoring copy does not hide a later high-scoring one.
        if doc.page_content not in seen_content and score >= score_min:
            seen_content.add(doc.page_content)
            unique_documents.append(doc)
    return unique_documents


def get_data(query, top_k, score):
    """Run a similarity search against the initialised vector store.

    Args:
        query: Text to search for.
        top_k: Maximum number of matches to request from the vector store.
        score: Minimum score a match needs in order to be returned.

    Returns:
        A list of unique matching documents, or a message string when the
        database has not been initialised or the query is empty.
    """
    # The database handle is only set by init_db(); without this check the
    # search below would fail with AttributeError on None.
    if g_db is None:
        return "Please init db in configuration"
    if not query:
        return "Please enter a query"

    print("Use db: " + str(g_db))

    # The slider may deliver a float; the search expects an integer count.
    docs = g_db.similarity_search_with_score(query=query, k=int(top_k))

    return remove_duplicates(docs, score)


# Gradio UI: a "Matching" tab to run searches and a "Configuration" tab to
# choose the embedding model and connect to Pinecone.
# NOTE(review): the original indentation was lost, so the nesting below is
# reconstructed from the order of the calls - confirm the intended layout.
with gr.Blocks(
    title="3GPP Database",
    theme="Base",
    css=""".bigbox {
    min-height:250px;
}
""",
) as demo:
    with gr.Tab("Matching"):
        with gr.Accordion("Vector similarity"):
            with gr.Row():
                with gr.Column():
                    top_k = gr.Slider(
                        1,
                        TOP_K_MAX,
                        value=TOP_K_DEFAULT,
                        step=1,
                        label="Vector similarity top_k",
                        interactive=True,
                    )
                with gr.Column():
                    score = gr.Slider(
                        0.01,
                        0.99,
                        value=SCORE_DEFAULT,
                        step=0.01,
                        label="Vector similarity score",
                        interactive=True,
                    )

        with gr.Row():
            inp = gr.Textbox(
                label="Input",
                placeholder="What are you looking for?",
            )
            out = gr.Textbox(label="Output")

        btn_run = gr.Button("Run", variant="primary")

    with gr.Tab("Configuration"):
        with gr.Row():
            # get_db is passed as a callable, not called here.
            loading = gr.Textbox(get_db, max_lines=1, show_label=False)
            btn_init = gr.Button("Init")

        # NOTE(review): several fields below use type='email' although they do
        # not hold e-mail addresses; kept as-is - confirm whether 'text' was meant.
        with gr.Accordion("Embedding"):
            with gr.Row():
                with gr.Column():
                    emb_textbox = gr.Textbox(
                        label="Embedding Model",
                        value=EMBEDDING_MODEL,
                        placeholder="Paste Your Embedding Model Repo on HuggingFace",
                        lines=1,
                        interactive=True,
                        type='email',
                    )
                with gr.Column():
                    emb_dropdown = gr.Dropdown(
                        EMBEDDING_LIST,
                        value=EMBEDDING_LOADER,
                        multiselect=False,
                        interactive=True,
                        label="Embedding Loader",
                    )

        with gr.Accordion("Pinecone Database"):
            with gr.Row():
                db_api_textbox = gr.Textbox(
                    label="Pinecone API Key",
                    value=PINECONE_KEY,
                    placeholder="Paste Your Pinecone API Key (xx-xx-xx-xx-xx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type='password',
                )
            with gr.Row():
                db_env_textbox = gr.Textbox(
                    label="Pinecone Environment",
                    value=PINECONE_ENV,
                    placeholder="Paste Your Pinecone Environment (xx-xx-xx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type='email',
                )
                db_index_textbox = gr.Textbox(
                    label="Pinecone Index",
                    value=PINECONE_INDEX,
                    placeholder="Paste Your Pinecone Index (xxxx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type='email',
                )

    # "Init" connects to Pinecone and shows the resulting handle in `loading`.
    btn_init.click(
        fn=init_db,
        inputs=[emb_textbox, emb_dropdown, db_api_textbox, db_env_textbox, db_index_textbox],
        outputs=loading,
    )
    # "Run" performs the similarity search and writes the result to `out`.
    btn_run.click(fn=get_data, inputs=[inp, top_k, score], outputs=out)


if __name__ == "__main__":
    # Enable the request queue, then start the app and open it in a browser.
    demo.queue()
    demo.launch(inbrowser=True)