Spaces:

ShawnAI
/

VectorDB

Running

App Files Files Community

VectorDB / app.py

ShawnAI

Update app.py

780cfe3 verified 10 months ago

raw

history blame contribute delete

5.61 kB

	import gradio as gr

	from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings, OpenAIEmbeddings
	from langchain.vectorstores import Pinecone
	import pinecone
	import os
	os.environ["TOKENIZERS_PARALLELISM"] = "false"


	# Pinecone connection settings; each can be overridden through the environment.
	PINECONE_KEY = os.environ.get("PINECONE_KEY", "")
	PINECONE_ENV = os.environ.get("PINECONE_ENV", "us-east-1")
	PINECONE_INDEX = os.environ.get("PINECONE_INDEX", "3gpp-r16-hg")

	# Embedding model name and the name of the loader class used to build it.
	# EMBEDDING_LIST holds the loader names offered in the configuration dropdown.
	EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "hkunlp/instructor-large")
	EMBEDDING_LOADER = os.environ.get("EMBEDDING_LOADER", "HuggingFaceInstructEmbeddings")
	EMBEDDING_LIST = ["HuggingFaceInstructEmbeddings", "HuggingFaceEmbeddings"]

	# Slider settings: how many text chunks to fetch from the vector store
	# (default and upper bound) and the default minimum similarity score.
	TOP_K_DEFAULT = 15
	TOP_K_MAX = 30
	SCORE_DEFAULT = 0.33

	# Shared vector store handle; stays None until init_db() assigns it.
	# (A `global` statement is a no-op at module level, so none is needed here.)
	g_db = None

	def init_db(emb_name, emb_loader, db_api_key, db_env, db_index):
	    """Connect to an existing Pinecone index and store it as the shared ``g_db``.

	    Args:
	        emb_name: Model name passed to the embedding loader as ``model_name``.
	        emb_loader: Name of the embedding class to instantiate; must be one
	            of the embedding classes imported by this module.
	        db_api_key: Pinecone API key.
	        db_env: Pinecone environment name.
	        db_index: Name of the existing Pinecone index to attach to.

	    Returns:
	        ``str(g_db)`` on success, or an explanatory message when
	        ``emb_loader`` is not a supported loader name (``g_db`` is then
	        left unchanged).
	    """
	    global g_db

	    # The loader name arrives from the UI, so it is resolved through an
	    # explicit allow-list rather than eval(), which would execute any
	    # expression a client sends.
	    supported_loaders = (
	        "HuggingFaceInstructEmbeddings",
	        "HuggingFaceEmbeddings",
	        "OpenAIEmbeddings",
	    )
	    if emb_loader not in supported_loaders:
	        return f"Unsupported embedding loader: {emb_loader!r}"

	    # Safe lookup: the name is now known to be one of the imported classes.
	    embeddings = globals()[emb_loader](model_name=emb_name)

	    pinecone.init(api_key=db_api_key, environment=db_env)

	    g_db = Pinecone.from_existing_index(index_name=db_index, embedding=embeddings)
	    return str(g_db)


	def get_db():
	    """Return the shared vector store handle (``None`` until ``init_db`` sets it)."""
	    return g_db


	def remove_duplicates(documents, score_min):
	    """Keep the first occurrence of each distinct text that meets the score floor.

	    Args:
	        documents: Iterable of ``(doc, score)`` pairs; each ``doc`` exposes
	            a ``page_content`` attribute.
	        score_min: Lowest score a pair may have and still be kept.

	    Returns:
	        The kept ``doc`` objects (scores dropped), in their original order.
	    """
	    kept = []
	    texts_kept = set()
	    for candidate, similarity in documents:
	        text = candidate.page_content
	        if text in texts_kept:
	            continue
	        # A rejected low-score chunk is not remembered, so a later copy of
	        # the same text with a high enough score can still be accepted.
	        if not (similarity >= score_min):
	            continue
	        texts_kept.add(text)
	        kept.append(candidate)
	    return kept


	def get_data(query, top_k, score):
	    """Search the initialised vector store and return de-duplicated matches.

	    Args:
	        query: Text to search for.
	        top_k: Maximum number of chunks to request from the vector store.
	        score: Minimum similarity score a chunk needs to be kept.

	    Returns:
	        A list of unique documents scoring at least ``score``, or a message
	        string when the database is not initialised or the query is empty.
	    """
	    # g_db is None until init_db() assigns it; searching before that would
	    # raise AttributeError instead of telling the user what to do.
	    if g_db is None:
	        return "Please init db in configuration"
	    if not query:
	        return "Please enter a query"

	    print("Use db: " + str(g_db))

	    # Coerce to int in case the slider value arrives as a float.
	    matches = g_db.similarity_search_with_score(query=query, k=int(top_k))
	    return remove_duplicates(matches, score)

	# Gradio UI: a "Matching" tab for running similarity searches and a
	# "Configuration" tab for choosing the embedding model and Pinecone index.
	with gr.Blocks(
	    title="3GPP Database",
	    theme="Base",
	    css=".bigbox {\n    min-height:250px;\n}\n",
	) as demo:
	    with gr.Tab("Matching"):
	        # Search tuning: number of chunks to fetch and minimum score to keep.
	        with gr.Accordion("Vector similarity"):
	            with gr.Row():
	                with gr.Column():
	                    top_k = gr.Slider(
	                        1,
	                        TOP_K_MAX,
	                        value=TOP_K_DEFAULT,
	                        step=1,
	                        label="Vector similarity top_k",
	                        interactive=True,
	                    )
	                with gr.Column():
	                    score = gr.Slider(
	                        0.01,
	                        0.99,
	                        value=SCORE_DEFAULT,
	                        step=0.01,
	                        label="Vector similarity score",
	                        interactive=True,
	                    )

	        with gr.Row():
	            inp = gr.Textbox(
	                label="Input",
	                placeholder="What are you looking for?",
	            )
	            out = gr.Textbox(label="Output")

	        btn_run = gr.Button("Run", variant="primary")

	    with gr.Tab("Configuration"):
	        with gr.Row():
	            # Status box: shows the current vector store handle from get_db.
	            loading = gr.Textbox(get_db, max_lines=1, show_label=False)
	            btn_init = gr.Button("Init")

	        with gr.Accordion("Embedding"):
	            with gr.Row():
	                with gr.Column():
	                    # Free-form model repo name, so a plain text input
	                    # (not an e-mail field) is the right type.
	                    emb_textbox = gr.Textbox(
	                        label="Embedding Model",
	                        value=EMBEDDING_MODEL,
	                        placeholder="Paste Your Embedding Model Repo on HuggingFace",
	                        lines=1,
	                        interactive=True,
	                        type="text",
	                    )

	                with gr.Column():
	                    emb_dropdown = gr.Dropdown(
	                        EMBEDDING_LIST,
	                        value=EMBEDDING_LOADER,
	                        multiselect=False,
	                        interactive=True,
	                        label="Embedding Loader",
	                    )

	        with gr.Accordion("Pinecone Database"):
	            with gr.Row():
	                # Password type keeps the API key masked in the browser.
	                db_api_textbox = gr.Textbox(
	                    label="Pinecone API Key",
	                    value=PINECONE_KEY,
	                    placeholder="Paste Your Pinecone API Key (xx-xx-xx-xx-xx) and Hit ENTER",
	                    lines=1,
	                    interactive=True,
	                    type="password",
	                )
	            with gr.Row():
	                db_env_textbox = gr.Textbox(
	                    label="Pinecone Environment",
	                    value=PINECONE_ENV,
	                    placeholder="Paste Your Pinecone Environment (xx-xx-xx) and Hit ENTER",
	                    lines=1,
	                    interactive=True,
	                    type="text",
	                )
	                db_index_textbox = gr.Textbox(
	                    label="Pinecone Index",
	                    value=PINECONE_INDEX,
	                    placeholder="Paste Your Pinecone Index (xxxx) and Hit ENTER",
	                    lines=1,
	                    interactive=True,
	                    type="text",
	                )

	    # Event wiring: "Init" (re)connects the vector store and reports the
	    # result in the status box; "Run" executes the similarity search.
	    btn_init.click(
	        fn=init_db,
	        inputs=[emb_textbox, emb_dropdown, db_api_textbox, db_env_textbox, db_index_textbox],
	        outputs=loading,
	    )
	    btn_run.click(fn=get_data, inputs=[inp, top_k, score], outputs=out)

	if __name__ == "__main__":
	    # Enable the request queue, then start the server and open a browser tab.
	    demo.queue().launch(inbrowser=True)