{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "-DBXBd1Q6SFF" }, "outputs": [], "source": [ "import requests\n", "from typing import List, Dict, Any, Iterator\n", "\n", "class DatasetSearchClient:\n", " def __init__(self, base_url: str = \"https://librarian-bots-dataset-column-search-api.hf.space\"):\n", " self.base_url = base_url\n", "\n", " def search(self,\n", " columns: List[str],\n", " match_all: bool = False,\n", " page_size: int = 100) -> Iterator[Dict[str, Any]]:\n", " \"\"\"\n", " Search datasets using the provided API, automatically handling pagination.\n", "\n", " Args:\n", " columns (List[str]): List of column names to search for.\n", " match_all (bool, optional): If True, match all columns. If False, match any column. Defaults to False.\n", " page_size (int, optional): Number of results per page. Defaults to 100.\n", "\n", " Yields:\n", " Dict[str, Any]: Each dataset result from all pages.\n", "\n", " Raises:\n", " requests.RequestException: If there's an error with the HTTP request.\n", " ValueError: If the API returns an unexpected response format.\n", " \"\"\"\n", " page = 1\n", " total_results = None\n", "\n", " while total_results is None or (page - 1) * page_size < total_results:\n", " params = {\n", " \"columns\": columns,\n", " \"match_all\": str(match_all).lower(),\n", " \"page\": page,\n", " \"page_size\": page_size\n", " }\n", "\n", " try:\n", " response = requests.get(f\"{self.base_url}/search\", params=params)\n", " response.raise_for_status()\n", " data = response.json()\n", "\n", " if not {\"total\", \"page\", \"page_size\", \"results\"}.issubset(data.keys()):\n", " raise ValueError(\"Unexpected response format from the API\")\n", "\n", " if total_results is None:\n", " total_results = data['total']\n", "\n", " for dataset in data['results']:\n", " yield dataset\n", "\n", " page += 1\n", "\n", " except requests.RequestException as e:\n", " raise requests.RequestException(f\"Error connecting to the API: {str(e)}\")\n", " except ValueError as e:\n", " raise ValueError(f\"Error processing API response: {str(e)}\")\n", "\n", "# Create an instance of the client\n", "client = DatasetSearchClient()" ] }, { "cell_type": "code", "source": [ "results = list(client.search(['tools'],match_all=True))\n", "len(results)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9yupgFYx6Sqx", "outputId": "ac6d7c15-2267-4bbd-ceaa-1d98faee188b" }, "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "38" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "code", "source": [ "results[0]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "atL-PQq76VrV", "outputId": "f357fe16-a1f9-4bb2-ca3d-767f3ac6508d" }, "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'hub_id': 'llamafactory/glaive_toolcall_en',\n", " 'likes': 1,\n", " 'downloads': 1151,\n", " 'tags': ['task_categories:text-generation',\n", " 'task_categories:question-answering',\n", " 'language:en',\n", " 'license:apache-2.0',\n", " 'size_categories:1K