Spaces:

librarian-bots
/

huggingface-datasets-semantic-search

Running

App Files Files Community

davanstrien HF staff committed on Aug 9, 2023

Commit

13dd954

•

1 Parent(s): 6b04271

draft app

Browse files

Files changed (3) hide show

app.py +108 -0
requirements.in +4 -0
requirements.txt +294 -0

app.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import gradio as gr
+from qdrant_client import QdrantClient
+from qdrant_client import models
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import hf_hub_url
+from dotenv import load_dotenv
+import os
+from functools import lru_cache
+load_dotenv()
+URL = os.getenv("QDRANT_URL")
+QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
+sentence_embedding_model = SentenceTransformer("BAAI/bge-large-en")
+print(URL)
+print(QDRANT_API_KEY)
+collection_name = "dataset_cards"
+client = QdrantClient(
+    url=URL,
+    api_key=QDRANT_API_KEY,
+)
+def format_results(results):
+    markdown = ""
+    for result in results:
+        hub_id = result.payload["id"]
+        url = hf_hub_url(hub_id, "README.md", repo_type="dataset")
+        header = f"## [{hub_id}]({url})"
+        markdown += header + "\n"
+        markdown += result.payload["section_text"] + "\n"
+    return markdown
+@lru_cache()
+def search(query: str):
+    query_ = sentence_embedding_model.encode(
+        f"Represent this sentence for searching relevant passages:{query}"
+    )
+    results = client.search(
+        collection_name="dataset_cards",
+        query_vector=query_,
+        limit=10,
+    )
+    return format_results(results)
+@lru_cache()
+def hub_id_qdrant_id(hub_id):
+    matches = client.scroll(
+        collection_name="dataset_cards",
+        scroll_filter=models.Filter(
+            must=[
+                models.FieldCondition(key="id", match=models.MatchValue(value=hub_id)),
+            ]
+        ),
+        limit=1,
+        with_payload=True,
+        with_vectors=False,
+    )
+    try:
+        return matches[0][0].id
+    except IndexError as e:
+        raise gr.Error(
+            f"Hub id {hub_id} not in out database. This could be because it is very new or because it doesn't have much documentation."
+        ) from e
+@lru_cache()
+def recommend(hub_id):
+    positive_id = hub_id_qdrant_id(hub_id)
+    results = client.recommend(collection_name=collection_name, positive=[positive_id])
+    return format_results(results)
+def query(search_term, search_type):
+    if search_type == "Recommend similar datasets":
+        return recommend(search_term)
+    else:
+        return search(search_term)
+with gr.Blocks() as demo:
+    gr.Markdown("## &#129303; Sematic dataset search")
+    with gr.Row():
+        gr.Markdown(
+            "This Gradio app allows you to search for datasets based on their descriptions. You can either search for similar datasets to a given dataset or search for datasets based on a query."
+        )
+    with gr.Row():
+        search_term = gr.Textbox(value="movie review sentiment",
+            label="hub id i.e. IMDB or query i.e. movie review sentiment"
+        )
+    with gr.Row():
+        with gr.Row():
+            find_similar_btn = gr.Button("Search")
+            search_type = gr.Radio(
+                ["Recommend similar datasets", "Semantic Search"],
+                label="Search type",
+                value="Semantic Search",
+                interactive=True,
+            )
+    results = gr.Markdown()
+    find_similar_btn.click(query, [search_term, search_type], results)
+demo.launch()

requirements.in ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio
+python-dotenv
+qdrant-client==1.3.1
+sentence-transformers

requirements.txt ADDED Viewed

	@@ -0,0 +1,294 @@

+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile
+#
+aiofiles==23.2.1
+    # via gradio
+aiohttp==3.8.5
+    # via gradio
+aiosignal==1.3.1
+    # via aiohttp
+altair==5.0.1
+    # via gradio
+anyio==3.7.1
+    # via
+    #   httpcore
+    #   starlette
+async-timeout==4.0.2
+    # via aiohttp
+attrs==23.1.0
+    # via
+    #   aiohttp
+    #   jsonschema
+    #   referencing
+certifi==2023.7.22
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+charset-normalizer==3.2.0
+    # via
+    #   aiohttp
+    #   requests
+click==8.1.6
+    # via
+    #   nltk
+    #   uvicorn
+contourpy==1.1.0
+    # via matplotlib
+cycler==0.11.0
+    # via matplotlib
+fastapi==0.101.0
+    # via gradio
+ffmpy==0.3.1
+    # via gradio
+filelock==3.12.2
+    # via
+    #   huggingface-hub
+    #   torch
+    #   transformers
+fonttools==4.42.0
+    # via matplotlib
+frozenlist==1.4.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2023.6.0
+    # via
+    #   gradio-client
+    #   huggingface-hub
+gradio==3.39.0
+    # via -r requirements.in
+gradio-client==0.3.0
+    # via gradio
+grpcio==1.56.2
+    # via
+    #   grpcio-tools
+    #   qdrant-client
+grpcio-tools==1.56.2
+    # via qdrant-client
+h11==0.14.0
+    # via
+    #   httpcore
+    #   uvicorn
+h2==4.1.0
+    # via httpx
+hpack==4.0.0
+    # via h2
+httpcore==0.17.3
+    # via httpx
+httpx[http2]==0.24.1
+    # via
+    #   gradio
+    #   gradio-client
+    #   qdrant-client
+huggingface-hub==0.16.4
+    # via
+    #   gradio
+    #   gradio-client
+    #   sentence-transformers
+    #   transformers
+hyperframe==6.0.1
+    # via h2
+idna==3.4
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+jinja2==3.1.2
+    # via
+    #   altair
+    #   gradio
+    #   torch
+joblib==1.3.2
+    # via
+    #   nltk
+    #   scikit-learn
+jsonschema==4.19.0
+    # via altair
+jsonschema-specifications==2023.7.1
+    # via jsonschema
+kiwisolver==1.4.4
+    # via matplotlib
+linkify-it-py==2.0.2
+    # via markdown-it-py
+markdown-it-py[linkify]==2.2.0
+    # via
+    #   gradio
+    #   mdit-py-plugins
+markupsafe==2.1.3
+    # via
+    #   gradio
+    #   jinja2
+matplotlib==3.7.2
+    # via gradio
+mdit-py-plugins==0.3.3
+    # via gradio
+mdurl==0.1.2
+    # via markdown-it-py
+mpmath==1.3.0
+    # via sympy
+multidict==6.0.4
+    # via
+    #   aiohttp
+    #   yarl
+networkx==3.1
+    # via torch
+nltk==3.8.1
+    # via sentence-transformers
+numpy==1.25.2
+    # via
+    #   altair
+    #   contourpy
+    #   gradio
+    #   matplotlib
+    #   pandas
+    #   qdrant-client
+    #   scikit-learn
+    #   scipy
+    #   sentence-transformers
+    #   torchvision
+    #   transformers
+orjson==3.9.4
+    # via gradio
+packaging==23.1
+    # via
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   matplotlib
+    #   transformers
+pandas==2.0.3
+    # via
+    #   altair
+    #   gradio
+pillow==10.0.0
+    # via
+    #   gradio
+    #   matplotlib
+    #   torchvision
+portalocker==2.7.0
+    # via qdrant-client
+protobuf==4.24.0
+    # via grpcio-tools
+pydantic==1.10.12
+    # via
+    #   fastapi
+    #   gradio
+    #   qdrant-client
+pydub==0.25.1
+    # via gradio
+pyparsing==3.0.9
+    # via matplotlib
+python-dateutil==2.8.2
+    # via
+    #   matplotlib
+    #   pandas
+python-dotenv==1.0.0
+    # via -r requirements.in
+python-multipart==0.0.6
+    # via gradio
+pytz==2023.3
+    # via pandas
+pyyaml==6.0.1
+    # via
+    #   gradio
+    #   huggingface-hub
+    #   transformers
+qdrant-client==1.3.1
+    # via -r requirements.in
+referencing==0.30.2
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+regex==2023.8.8
+    # via
+    #   nltk
+    #   transformers
+requests==2.31.0
+    # via
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   torchvision
+    #   transformers
+rpds-py==0.9.2
+    # via
+    #   jsonschema
+    #   referencing
+safetensors==0.3.2
+    # via transformers
+scikit-learn==1.3.0
+    # via sentence-transformers
+scipy==1.11.1
+    # via
+    #   scikit-learn
+    #   sentence-transformers
+semantic-version==2.10.0
+    # via gradio
+sentence-transformers==2.2.2
+    # via -r requirements.in
+sentencepiece==0.1.99
+    # via sentence-transformers
+six==1.16.0
+    # via python-dateutil
+sniffio==1.3.0
+    # via
+    #   anyio
+    #   httpcore
+    #   httpx
+starlette==0.27.0
+    # via fastapi
+sympy==1.12
+    # via torch
+threadpoolctl==3.2.0
+    # via scikit-learn
+tokenizers==0.13.3
+    # via transformers
+toolz==0.12.0
+    # via altair
+torch==2.0.1
+    # via
+    #   sentence-transformers
+    #   torchvision
+torchvision==0.15.2
+    # via sentence-transformers
+tqdm==4.66.0
+    # via
+    #   huggingface-hub
+    #   nltk
+    #   sentence-transformers
+    #   transformers
+transformers==4.31.0
+    # via sentence-transformers
+typing-extensions==4.5.0
+    # via
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   pydantic
+    #   qdrant-client
+    #   torch
+tzdata==2023.3
+    # via pandas
+uc-micro-py==1.0.2
+    # via linkify-it-py
+urllib3==1.26.16
+    # via
+    #   qdrant-client
+    #   requests
+uvicorn==0.23.2
+    # via gradio
+websockets==11.0.3
+    # via
+    #   gradio
+    #   gradio-client
+yarl==1.9.2
+    # via aiohttp
+# The following packages are considered to be unsafe in a requirements file:
+# setuptools