# Hugging Face Space (status shown when this page was captured: Runtime error)
import gradio as gr | |
from qdrant_client import QdrantClient | |
from qdrant_client import models | |
from sentence_transformers import SentenceTransformer | |
from huggingface_hub import hf_hub_url | |
from dotenv import load_dotenv | |
import os | |
from functools import lru_cache | |
# Module-level configuration: read connection settings from the environment
# (optionally populated from a local .env file by load_dotenv), load the
# embedding model, and create the Qdrant client shared by every request.
load_dotenv()
URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
sentence_embedding_model = SentenceTransformer("BAAI/bge-large-en")
print(URL)
# Never echo the API key itself: stdout ends up in the logs, so only report
# whether a key was provided.
print(f"QDRANT_API_KEY set: {bool(QDRANT_API_KEY)}")
collection_name = "dataset_cards"
client = QdrantClient(
    url=URL,
    api_key=QDRANT_API_KEY,
)
def format_results(results):
    """Render search/recommendation hits as a single Markdown string.

    Args:
        results: Iterable of hits. Each hit must expose a ``payload`` mapping
            with an ``"id"`` key (the Hub dataset id) and a
            ``"section_text"`` key (the text to display).

    Returns:
        One ``## [hub_id](url)`` header line followed by the section text for
        every hit, each terminated by a newline. An empty string when
        ``results`` is empty.
    """
    parts = []
    for result in results:
        payload = result.payload
        hub_id = payload["id"]
        # Link the header to the README.md file of the dataset repository.
        url = hf_hub_url(hub_id, "README.md", repo_type="dataset")
        parts.append(f"## [{hub_id}]({url})" + "\n")
        parts.append(payload["section_text"] + "\n")
    # Join once instead of growing the string with += on every iteration.
    return "".join(parts)
def search(query: str):
    """Run a semantic search for ``query`` and return the hits as Markdown.

    The query is embedded with the module-level sentence embedding model and
    the resulting vector is used to fetch the 10 nearest points from the
    Qdrant collection named by the module-level ``collection_name``.

    Args:
        query: Free-text search query.

    Returns:
        Markdown produced by ``format_results`` for the matching points.
    """
    # BGE retrieval models expect queries to be prefixed with this instruction,
    # separated from the query text by a single space.
    instruction = "Represent this sentence for searching relevant passages: "
    query_vector = sentence_embedding_model.encode(f"{instruction}{query}")
    results = client.search(
        # Use the shared constant (as `recommend` does) rather than a second
        # hard-coded copy of the collection name.
        collection_name=collection_name,
        query_vector=query_vector,
        limit=10,
    )
    return format_results(results)
def hub_id_qdrant_id(hub_id):
    """Look up the Qdrant point id stored for a Hub dataset id.

    Args:
        hub_id: Hub dataset id, matched exactly against the ``"id"`` payload
            field of the points in the collection.

    Returns:
        The ``id`` of the first point whose payload ``"id"`` equals ``hub_id``.

    Raises:
        gr.Error: If no point in the collection matches ``hub_id``.
    """
    # scroll() returns a (records, next_page_offset) pair; only the records
    # are needed here.
    records, _next_offset = client.scroll(
        # Use the shared constant (as `recommend` does) rather than a second
        # hard-coded copy of the collection name.
        collection_name=collection_name,
        scroll_filter=models.Filter(
            must=[
                models.FieldCondition(key="id", match=models.MatchValue(value=hub_id)),
            ]
        ),
        limit=1,
        # Only the point id is used, so skip transferring payload and vectors.
        with_payload=False,
        with_vectors=False,
    )
    if not records:
        raise gr.Error(
            f"Hub id {hub_id} not in our database. This could be because it is very new or because it doesn't have much documentation."
        )
    return records[0].id
def recommend(hub_id):
    """Return Markdown for points Qdrant recommends as similar to ``hub_id``.

    The Hub id is first resolved to its Qdrant point id via
    ``hub_id_qdrant_id``; that point is then used as the single positive
    example for the recommendation request.
    """
    seed_point_id = hub_id_qdrant_id(hub_id)
    return format_results(
        client.recommend(
            collection_name=collection_name,
            positive=[seed_point_id],
        )
    )
def query(search_term, search_type):
    """Dispatch a UI request to recommendation or semantic search.

    Args:
        search_term: Text entered by the user (a Hub id or a free-text query).
        search_type: Selected mode. ``"Recommend similar datasets"`` routes to
            ``recommend``; any other value routes to ``search``.

    Returns:
        The Markdown string produced by the selected handler.
    """
    handler = recommend if search_type == "Recommend similar datasets" else search
    return handler(search_term)
# Build the UI: title, short description, query box, search controls and a
# Markdown area for the results, then start the app.
with gr.Blocks() as demo:
    gr.Markdown(value="## 🤗 Semantic Dataset Search")

    with gr.Row():
        gr.Markdown(
            value="This Gradio app allows you to search for datasets based on their descriptions. You can either search for similar datasets to a given dataset or search for datasets based on a query."
        )

    with gr.Row():
        search_term = gr.Textbox(
            label="hub id i.e. IMDB or query i.e. movie review sentiment",
            value="movie review sentiment",
        )

    with gr.Row():
        with gr.Row():
            find_similar_btn = gr.Button(value="Search")
            search_type = gr.Radio(
                choices=["Recommend similar datasets", "Semantic Search"],
                value="Semantic Search",
                label="Search type",
                interactive=True,
            )

    results = gr.Markdown()

    # Clicking the button sends (search_term, search_type) to `query` and
    # renders its return value in the results area.
    find_similar_btn.click(
        fn=query,
        inputs=[search_term, search_type],
        outputs=results,
    )

demo.launch()