import asyncio
import json
import re
from typing import Dict, List
import gradio as gr
import httpx
from cashews import cache
from huggingface_hub import ModelCard
# Point cashews at its ``mem://`` (in-memory) backend. No function visible in
# this file is decorated with the cache yet.
cache.setup("mem://")
# Similarity-search endpoint for lookups by dataset ID; the text-query variant
# is reached by appending "_by_text" to this URL.
API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space/similar"
# Hub endpoint for dataset metadata; the dataset ID is appended as a path.
HF_API_URL = "https://huggingface.co/api/datasets"
# Raw README location of a dataset; "{}" is filled with the dataset ID.
README_URL_TEMPLATE = "https://huggingface.co/datasets/{}/raw/main/README.md"
async def fetch_similar_datasets(dataset_id: str, limit: int = 10) -> List[Dict]:
    """Return up to ``limit`` datasets similar to ``dataset_id``.

    Asks the endpoint at ``API_URL`` for ``limit + 1`` neighbours, because the
    queried dataset can show up in its own result list and is removed here.

    Args:
        dataset_id: Hub ID of the reference dataset (e.g. ``"user/name"``).
        limit: Maximum number of results to return.

    Returns:
        The entries of the response's ``"results"`` list whose
        ``"dataset_id"`` differs from ``dataset_id``, truncated to ``limit``.
        An empty list when the service answers with a non-200 status.
    """
    # Hand the values to httpx as ``params`` so they are percent-encoded.
    # Interpolating the user-typed ID straight into the URL let characters
    # such as ``&``, ``#``, ``+`` or spaces corrupt the query string.
    query_params = {"dataset_id": dataset_id, "n": limit + 1}
    async with httpx.AsyncClient() as client:
        response = await client.get(API_URL, params=query_params)
        if response.status_code != 200:
            return []
        results = response.json()["results"]
        # Remove the input dataset from the results.
        return [r for r in results if r["dataset_id"] != dataset_id][:limit]
async def fetch_similar_datasets_by_text(query: str, limit: int = 10) -> List[Dict]:
    """Return up to ``limit`` datasets matching a free-text ``query``.

    Sends a POST to the ``_by_text`` variant of ``API_URL`` with ``query`` and
    ``n = limit + 1`` as URL parameters.

    Args:
        query: Free-text description of the datasets being looked for.
        limit: Maximum number of results to return.

    Returns:
        The first ``limit`` entries of the response's ``"results"`` list, or
        an empty list when the service answers with a non-200 status.
    """
    endpoint = f"{API_URL}_by_text"
    request_params = {"query": query, "n": limit + 1}
    async with httpx.AsyncClient() as client:
        reply = await client.post(endpoint, params=request_params)
        if reply.status_code != 200:
            return []
        matches = reply.json()["results"]
        return matches[:limit]
async def search_similar_datasets_by_text(query: str, limit: int = 10):
    """Search by free text and return the matches rendered as markdown.

    Args:
        query: Free-text description of the datasets being looked for.
        limit: Maximum number of results to show.

    Returns:
        The string ``"No similar datasets found."`` when the search yields
        nothing, otherwise the markdown produced by ``format_results``.
    """
    results = await fetch_similar_datasets_by_text(query, limit)
    if not results:
        return "No similar datasets found."

    hub_ids = [result["dataset_id"] for result in results]
    # A single outer gather keeps every card and metadata request in flight
    # at once. Awaiting two gathers back to back meant the metadata batch
    # only started after the slowest card download had finished.
    dataset_cards, dataset_infos = await asyncio.gather(
        asyncio.gather(*(fetch_dataset_card(hub_id) for hub_id in hub_ids)),
        asyncio.gather(*(fetch_dataset_info(hub_id) for hub_id in hub_ids)),
    )
    return format_results(results, dataset_cards, dataset_infos)
async def fetch_dataset_card(dataset_id: str) -> str:
    """Download a dataset's README and return the card text.

    Args:
        dataset_id: Hub ID used to fill ``README_URL_TEMPLATE``.

    Returns:
        The ``text`` attribute of a ``ModelCard`` built from the response
        body, or an empty string when the request does not return 200.
    """
    readme_url = README_URL_TEMPLATE.format(dataset_id)
    async with httpx.AsyncClient() as client:
        reply = await client.get(readme_url)
    if reply.status_code != 200:
        return ""
    return ModelCard(reply.text).text
async def fetch_dataset_info(dataset_id: str) -> Dict:
    """Fetch a dataset's metadata from the Hub API.

    Args:
        dataset_id: Hub ID appended to ``HF_API_URL``.

    Returns:
        The decoded JSON body on a 200 response, otherwise an empty dict.
    """
    info_url = f"{HF_API_URL}/{dataset_id}"
    async with httpx.AsyncClient() as client:
        reply = await client.get(info_url)
    if reply.status_code != 200:
        return {}
    return reply.json()
# Render the search results as one markdown document.
#
# The three lists are walked in lockstep with zip(). For every
# (result, card, info) triple the output gains a "##" heading that links to
# the dataset's Hub page, the similarity score with four decimals, a
# downloads / likes / last-modified line when `info` is non-empty, card
# content when `card` is non-empty, and a "---" separator. The accumulated
# string is returned.
#
# NOTE(review): parts of this function look damaged in this copy. The page
# heading literal is split across lines, the preview expression stops in the
# middle of a startswith() call, and the re.sub() that builds `full_card` has
# an empty pattern and no replacement argument. Presumably inline HTML was
# stripped somewhere upstream; restore the original text before changing
# behaviour here.
def format_results(
results: List[Dict], dataset_cards: List[str], dataset_infos: List[Dict]
) -> str:
# Page heading that precedes the per-dataset sections.
markdown = (
"
✨ Similar Datasets ✨
\n\n"
)
for result, card, info in zip(results, dataset_cards, dataset_infos):
hub_id = result["dataset_id"]
# Formatted with ":.4f" below, so this has to be a number.
similarity = result["similarity"]
url = f"https://huggingface.co/datasets/{hub_id}"
# Always use the Hub ID as the title
header = f"## [{hub_id}]({url})"
markdown += header + "\n"
markdown += f"**Similarity Score:** {similarity:.4f}\n\n"
# fetch_dataset_info() yields {} on a non-200 response, which skips this line.
if info:
downloads = info.get("downloads", 0)
likes = info.get("likes", 0)
last_modified = info.get("lastModified", "N/A")
markdown += f"**Downloads:** {downloads} | **Likes:** {likes} | **Last Modified:** {last_modified}\n\n"
# fetch_dataset_card() yields "" on a non-200 response, which skips the card section.
if card:
# Remove the title from the card content: drops the first line that
# starts with "#" (MULTILINE with count=1, so it need not be line one).
card_without_title = re.sub(
r"^#.*\n", "", card, count=1, flags=re.MULTILINE
)
# Split the card into paragraphs
paragraphs = card_without_title.split("\n\n")
# Find the first non-empty text paragraph that's not just an image
# NOTE(review): the expression below is truncated in this copy.
preview = next(
(
p
for p in paragraphs
if p.strip()
and not p.strip().startswith("![")
and not p.strip().startswith(" 300 else preview
# Add the preview
markdown += f"{preview}\n\n"
# Limit image size in the full dataset card
# NOTE(review): pattern and replacement are missing from this call as shown.
full_card = re.sub(
r'',
full_card,
)
markdown += f"Full Dataset Card
\n\n{full_card}\n\n \n\n"
# Separator between datasets.
markdown += "---\n\n"
return markdown
async def search_similar_datasets(dataset_id: str, limit: int = 10):
    """Search by dataset ID and return the matches rendered as markdown.

    Args:
        dataset_id: Hub ID of the reference dataset.
        limit: Maximum number of results to show.

    Returns:
        The string ``"No similar datasets found."`` when the search yields
        nothing, otherwise the markdown produced by ``format_results``.
    """
    results = await fetch_similar_datasets(dataset_id, limit)
    if not results:
        return "No similar datasets found."

    hub_ids = [result["dataset_id"] for result in results]
    # A single outer gather keeps every card and metadata request in flight
    # at once. Awaiting two gathers back to back meant the metadata batch
    # only started after the slowest card download had finished.
    dataset_cards, dataset_infos = await asyncio.gather(
        asyncio.gather(*(fetch_dataset_card(hub_id) for hub_id in hub_ids)),
        asyncio.gather(*(fetch_dataset_info(hub_id) for hub_id in hub_ids)),
    )
    return format_results(results, dataset_cards, dataset_infos)
# Gradio front end: choose a search mode, enter a dataset ID or a text query,
# and show the formatted matches as markdown.
with gr.Blocks() as demo:
    gr.Markdown("## 🤗 Dataset Similarity Search")

    # Introductory blurb.
    with gr.Row():
        gr.Markdown(
            "This Gradio app allows you to find similar datasets based on a given dataset ID or a text query. "
            "Choose the search type and enter either a dataset ID or a text query to find similar datasets with previews of their dataset cards.\n\n"
            "For a seamless experience on the Hugging Face website, check out the "
            "[Hugging Face Similar Chrome extension](https://chromewebstore.google.com/detail/hugging-face-similar/aijelnjllajooinkcpkpbhckbghghpnl?authuser=0&hl=en). "
            "This extension adds a 'Similar Datasets' section directly to Hugging Face dataset pages, "
            "making it even easier to discover related datasets for your projects."
        )

    # Search mode selector; starts on "Dataset ID".
    with gr.Row():
        search_type = gr.Radio(
            ["Dataset ID", "Text Query"],
            label="Search Type",
            value="Dataset ID",
        )

    # One textbox per mode; the text-query box starts hidden.
    with gr.Row():
        dataset_id = gr.Textbox(
            value="airtrain-ai/fineweb-edu-fortified",
            label="Dataset ID (e.g., airtrain-ai/fineweb-edu-fortified)",
        )
        text_query = gr.Textbox(
            label="Text Query (e.g., 'natural language processing dataset')",
            visible=False,
        )

    with gr.Row():
        search_btn = gr.Button("Search Similar Datasets")
        max_results = gr.Slider(
            minimum=1,
            maximum=50,
            step=1,
            value=10,
            label="Maximum number of results",
        )

    # Output area for the rendered results.
    results = gr.Markdown()

    def toggle_input_visibility(choice):
        """Return visibility updates for the dataset-ID box and the text box."""
        show_id_box = choice == "Dataset ID"
        show_text_box = choice == "Text Query"
        return gr.update(visible=show_id_box), gr.update(visible=show_text_box)

    search_type.change(
        toggle_input_visibility,
        inputs=[search_type],
        outputs=[dataset_id, text_query],
    )

    # The handler is synchronous and drives the chosen coroutine to
    # completion with asyncio.run(); only the selected coroutine is created.
    search_btn.click(
        lambda mode, hub_id, query, limit: asyncio.run(
            search_similar_datasets_by_text(query, limit)
            if mode != "Dataset ID"
            else search_similar_datasets(hub_id, limit)
        ),
        inputs=[search_type, dataset_id, text_query, max_results],
        outputs=results,
    )

demo.launch()