davanstrien HF staff committed on
Commit
dba982b
1 Parent(s): 3352738

add linked models option

Browse files
Files changed (1) hide show
  1. app.py +42 -11
app.py CHANGED
@@ -6,6 +6,7 @@ import gradio as gr
6
  from dotenv import load_dotenv
7
  from qdrant_client import QdrantClient, models
8
  from sentence_transformers import SentenceTransformer
 
9
 
10
  load_dotenv()
11
 
@@ -22,7 +23,7 @@ client = QdrantClient(
22
  )
23
 
24
 
25
- def format_results(results):
26
  markdown = (
27
  "<h1 style='text-align: center;'> &#x2728; Dataset Search Results &#x2728;"
28
  " </h1> \n\n"
@@ -35,12 +36,31 @@ def format_results(results):
35
  markdown += header + "\n"
36
  markdown += f"**Downloads:** {download_number}\n\n"
37
  markdown += f"{result.payload['section_text']} \n"
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  return markdown
40
 
41
 
42
  @lru_cache(maxsize=100_000)
43
- def search(query: str, limit: Optional[int] = 10):
 
 
 
 
 
 
 
 
44
  query_ = sentence_embedding_model.encode(
45
  f"Represent this sentence for searching relevant passages:{query}"
46
  )
@@ -49,7 +69,7 @@ def search(query: str, limit: Optional[int] = 10):
49
  query_vector=query_,
50
  limit=limit,
51
  )
52
- return format_results(results)
53
 
54
 
55
  @lru_cache(maxsize=100_000)
@@ -69,25 +89,30 @@ def hub_id_qdrant_id(hub_id):
69
  return matches[0][0].id
70
  except IndexError as e:
71
  raise gr.Error(
72
- f"Hub id {hub_id} not in out database. This could be because it is very new"
73
  " or because it doesn't have much documentation."
74
  ) from e
75
 
76
 
77
  @lru_cache()
78
- def recommend(hub_id, limit: Optional[int] = 10):
79
  positive_id = hub_id_qdrant_id(hub_id)
80
  results = client.recommend(
81
  collection_name=collection_name, positive=[positive_id], limit=limit
82
  )
83
- return format_results(results)
84
 
85
 
86
- def query(search_term, search_type, limit: Optional[int] = 10):
 
 
 
 
 
87
  if search_type == "Recommend similar datasets":
88
- return recommend(search_term, limit)
89
  else:
90
- return search(search_term, limit)
91
 
92
 
93
  with gr.Blocks() as demo:
@@ -120,10 +145,16 @@ with gr.Blocks() as demo:
120
  step=1,
121
  value=10,
122
  label="Maximum number of results",
123
- help="This is the maximum number of results that will be returned",
124
  )
 
 
 
 
 
125
  results = gr.Markdown()
126
- find_similar_btn.click(query, [search_term, search_type, max_results], results)
 
 
127
 
128
 
129
  demo.launch()
 
6
  from dotenv import load_dotenv
7
  from qdrant_client import QdrantClient, models
8
  from sentence_transformers import SentenceTransformer
9
+ from huggingface_hub import list_models
10
 
11
  load_dotenv()
12
 
 
23
  )
24
 
25
 
26
+ def format_results(results, show_associated_models=True):
27
  markdown = (
28
  "<h1 style='text-align: center;'> &#x2728; Dataset Search Results &#x2728;"
29
  " </h1> \n\n"
 
36
  markdown += header + "\n"
37
  markdown += f"**Downloads:** {download_number}\n\n"
38
  markdown += f"{result.payload['section_text']} \n"
39
+ if show_associated_models:
40
+ if linked_models := get_models_for_dataset(hub_id):
41
+ linked_models = [
42
+ f"[{model}](https://huggingface.co/{model})"
43
+ for model in linked_models
44
+ ]
45
+ markdown += (
46
+ "<details><summary>Models trained on this dataset</summary>\n\n"
47
+ )
48
+ markdown += "- " + "\n- ".join(linked_models) + "\n\n"
49
+ markdown += "</details>\n\n"
50
 
51
  return markdown
52
 
53
 
54
@lru_cache(maxsize=100_000)
def get_models_for_dataset(id):
    """Return the hub ids of models trained on the dataset ``id``.

    Results are deduplicated and sorted so the returned list is
    deterministic — important both for the lru_cache and for stable
    rendering of the links in the search results markdown.

    Note: ``id`` shadows the builtin of the same name; kept as-is so the
    cached call signature stays backward-compatible.
    """
    # list_models() already returns an iterable; the original
    # list(iter(...)) double-wrap was redundant.
    results = list(list_models(filter=f"dataset:{id}"))
    if results:
        # Deduplicate by model id; sort for a deterministic ordering
        # (a bare set comprehension made the link order arbitrary).
        results = sorted({result.id for result in results})
    return results
60
+
61
+
62
+ @lru_cache(maxsize=200_000)
63
+ def search(query: str, limit: Optional[int] = 10, show_linked_models: bool = False):
64
  query_ = sentence_embedding_model.encode(
65
  f"Represent this sentence for searching relevant passages:{query}"
66
  )
 
69
  query_vector=query_,
70
  limit=limit,
71
  )
72
+ return format_results(results, show_associated_models=show_linked_models)
73
 
74
 
75
  @lru_cache(maxsize=100_000)
 
89
  return matches[0][0].id
90
  except IndexError as e:
91
  raise gr.Error(
92
+ f"Hub id {hub_id} not in the database. This could be because it is very new"
93
  " or because it doesn't have much documentation."
94
  ) from e
95
 
96
 
97
# Bounded cache: the bare @lru_cache() used previously is unbounded
# (maxsize=None) over an unbounded hub_id key space — a slow memory leak.
# 100_000 matches the bound used by the sibling caches in this file.
@lru_cache(maxsize=100_000)
def recommend(hub_id, limit: Optional[int] = 10, show_linked_models: bool = False):
    """Return markdown for datasets similar to ``hub_id``.

    Args:
        hub_id: dataset id on the Hub to find neighbours for.
        limit: maximum number of recommendations to return.
        show_linked_models: when True, each result also lists models
            trained on that dataset.

    Raises:
        gr.Error (via hub_id_qdrant_id) when ``hub_id`` is not in the
        vector database.
    """
    positive_id = hub_id_qdrant_id(hub_id)
    results = client.recommend(
        collection_name=collection_name, positive=[positive_id], limit=limit
    )
    return format_results(results, show_associated_models=show_linked_models)
104
 
105
 
106
def query(
    search_term,
    search_type,
    limit: Optional[int] = 10,
    show_linked_models: bool = False,
):
    """Dispatch a UI request to recommendation or semantic search.

    ``search_type`` selects the backend: the "Recommend similar datasets"
    option routes to recommend(); anything else falls through to search().
    """
    handler = recommend if search_type == "Recommend similar datasets" else search
    return handler(search_term, limit, show_linked_models)
116
 
117
 
118
  with gr.Blocks() as demo:
 
145
  step=1,
146
  value=10,
147
  label="Maximum number of results",
 
148
  )
149
# Checkbox controlling whether results list models trained on each dataset.
show_linked_models = gr.Checkbox(
    label="Show associated models",
    # gr.Checkbox takes `value=` for its initial state; `default=` is not a
    # valid keyword in gradio 3.x and raises a TypeError at launch.
    value=False,
)
153
+
154
  results = gr.Markdown()
155
+ find_similar_btn.click(
156
+ query, [search_term, search_type, max_results, show_linked_models], results
157
+ )
158
 
159
 
160
  demo.launch()