File size: 8,566 Bytes
4dc29ae 6791083 4dc29ae 6791083 4dc29ae 6791083 4dc29ae 2c29ed1 6791083 4dc29ae 6791083 2c29ed1 6791083 2c29ed1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
import sqlite3
import os
from huggingface_hub import HfApi
import requests
import huggingface_hub
from huggingface_hub.hf_api import SpaceInfo
from typing import List, Dict, Any, Union
import json
# Import-time sanity check: print the installed huggingface_hub version so a
# broken or missing installation is visible as soon as this module loads.
print("HuggingFace Hub version:", huggingface_hub.__version__)
#print("HuggingFace Hub API version:", HfApi.__version__)
# Location of the SQLite file: stored under /data when that directory exists,
# otherwise a file with the same name in the current working directory.
DB_PATH = '/data/huggingface_spaces.db' if os.path.exists('/data') else 'huggingface_spaces.db'
# Paths of the SQL files handed to load_sql_query(). The "create" files are
# run through executescript() in create_database(); the "update" files are
# executed with positional parameters, once per row.
# NOTE(review): the paths are relative; how they are resolved depends on
# load_sql_query, which is defined in sql.sql_utils - confirm.
SQL_CREATE_SPACES = 'sql/create_spaces.sql'
SQL_UPDATE_SPACES = 'sql/update_spaces.sql'
SQL_CREATE_ENDPOINTS = 'sql/create_endpoints.sql'
SQL_UPDATE_ENDPOINTS = 'sql/update_endpoints.sql'
SQL_CREATE_TOOLS = 'sql/create_tools.sql'
SQL_UPDATE_TOOLS = 'sql/update_tools.sql'
from sql.sql_utils import load_sql_query, is_database_outdated, update_db_timestamp, create_metadata_table
def create_database():
    """Initialize database if needed.

    Creates the metadata table through ``create_metadata_table`` and then
    runs the three table-creation scripts (spaces, endpoints, tools) against
    the database at ``DB_PATH``.

    The connection is closed explicitly: using a sqlite3 connection as a
    context manager only commits or rolls back the transaction, it does not
    close the connection, so the previous ``with`` form left the handle open.

    Raises:
        Whatever ``load_sql_query`` or sqlite3 raise; nothing is caught here.
    """
    create_metadata_table(DB_PATH)

    # Load every script before opening the connection so a missing SQL file
    # fails before the database is touched.
    schema_scripts = [
        load_sql_query(sql_path)
        for sql_path in (SQL_CREATE_SPACES, SQL_CREATE_ENDPOINTS, SQL_CREATE_TOOLS)
    ]

    conn = sqlite3.connect(DB_PATH)
    try:
        for script in schema_scripts:
            conn.executescript(script)
        conn.commit()
    finally:
        # Runs on success and on failure, so the handle is never leaked.
        conn.close()
def generate_endpoint_urls(space_id: str) -> Dict[str, str]:
    """Build the candidate MCP endpoint URLs for a space.

    The space id is turned into a host label by replacing every '/' and '_'
    with '-' and lower-casing the result (e.g. "Author/My_Space" becomes
    "author-my-space"). No network request is made here; the URLs are only
    candidates.

    Returns:
        A dict with the keys "sse" and "schema", in that order, each mapped
        to the corresponding URL under ``/gradio_api/mcp`` on ``hf.space``.
    """
    host_label = space_id.translate(str.maketrans('/_', '--')).lower()
    mcp_base = f"https://{host_label}.hf.space/gradio_api/mcp"
    return {kind: f"{mcp_base}/{kind}" for kind in ("sse", "schema")}
def check_endpoint_availability(url: str) -> bool:
    """Report whether a HEAD request to ``url`` ends in HTTP 200.

    Redirects are followed and the request is given a 5 second timeout.
    Any ``requests`` failure (which includes timeouts) counts as
    "not available" instead of being raised.
    """
    try:
        head_reply = requests.head(url, timeout=5, allow_redirects=True)
    except requests.exceptions.RequestException:
        # Timeout is a RequestException subclass, so this one clause covers
        # both of the exception types the function is meant to absorb.
        return False
    return head_reply.status_code == 200
def normalize_tool_format(tool_data: Union[Dict[str, Any], List[Dict[str, Any]]]) -> List[tuple[str, str, Dict]]:
    """
    Normalize different tool formats into a consistent format.

    Two input shapes are understood:

    * list format - a list of tool dicts; each needs a truthy 'name' and may
      carry 'description' and 'inputSchema' (used as the properties);
    * dictionary format - a mapping of tool name to a tool definition dict;
      the whole definition dict is used as the properties.

    The data comes from a remote JSON document, so malformed parts are
    skipped instead of raising: list entries that are not dicts or have no
    name, dict values that are not dicts, and any top-level value that is
    neither a list nor a dict (which yields an empty result).

    Returns list of tuples: (tool_name, description, properties)
    """
    if isinstance(tool_data, list):
        return [
            (tool['name'], tool.get('description', ''), tool.get('inputSchema', {}))
            for tool in tool_data
            if isinstance(tool, dict) and tool.get('name')
        ]

    if isinstance(tool_data, dict):
        return [
            (name, data.get('description', ''), data)
            for name, data in tool_data.items()
            if isinstance(data, dict)
        ]

    # None, strings, numbers...: nothing that can be interpreted as tools.
    return []
def fetch_and_parse_schema(url: str) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
    """Download the tool schema at ``url`` and return it as decoded JSON.

    The GET request uses a 10 second timeout. A transport error, a non-2xx
    status, or a body that is not valid JSON all result in an empty list
    rather than an exception.
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        payload = reply.json()
    except (ValueError, requests.exceptions.RequestException):
        return []
    return payload
def save_endpoints_and_tools(conn: sqlite3.Connection, space_id: str):
    """Discover and store endpoints and tools for a space.

    Each candidate URL from ``generate_endpoint_urls`` is probed exactly once
    with ``check_endpoint_availability``; every reachable one is written with
    the endpoints query. When the "schema" endpoint is reachable its document
    is fetched, normalized, and one row per tool is written with the tools
    query (properties are stored as a JSON string).

    Args:
        conn: Open SQLite connection. Rows are written through a cursor of
            this connection; committing is left to the caller.
        space_id: Space identifier, passed as the first parameter of every
            row written here.

    A failure while saving a single tool is printed and that tool is
    skipped. Errors from the endpoint insert are not caught here.
    """
    cursor = conn.cursor()
    endpoint_urls = generate_endpoint_urls(space_id)

    # Probe every URL once and remember the outcome, so the schema endpoint
    # is not hit with a second HEAD request further down.
    available = {
        endpoint_type: url
        for endpoint_type, url in endpoint_urls.items()
        if check_endpoint_availability(url)
    }
    if not available:
        return

    # The SQL text is the same for every row: load it once, not per endpoint.
    query_update_endpoints = load_sql_query(SQL_UPDATE_ENDPOINTS)
    for endpoint_type, url in available.items():
        cursor.execute(query_update_endpoints, (space_id, endpoint_type, url))

    # Process schema endpoint if available
    schema_url = available.get('schema')
    if schema_url is None:
        return
    tools_data = fetch_and_parse_schema(schema_url)
    if not tools_data:
        return

    # Convert to normalized format and save
    query_update_tools = None
    for tool_name, description, properties in normalize_tool_format(tools_data):
        try:
            # Loaded lazily inside the try so a load failure is still
            # reported per tool, but the file is read at most once on success.
            if query_update_tools is None:
                query_update_tools = load_sql_query(SQL_UPDATE_TOOLS)
            cursor.execute(query_update_tools, (
                space_id,
                tool_name,
                description,
                json.dumps(properties)
            ))
        except Exception as e:
            print(f"Error saving tool {tool_name} for space {space_id}: {e}")
def fetch_spaces() -> List[Dict[str, Any]]:
    """
    Fetch spaces using the Hugging Face API with enhanced filtering and model card access

    Lists up to 100 spaces matching the search term "mcp-server" with the
    "gradio" filter, requests the detailed info of each one, and flattens it
    into a plain dict.

    Returns:
        A list of dicts with the keys 'id', 'title', 'author', 'description',
        'likes', 'url', 'tags' (a space-separated string, or None when the
        space has no tags), 'last_modified' and 'private'.

        A space whose details cannot be fetched or read is reported on stdout
        and skipped. If listing itself fails, the error is printed and an
        empty list is returned, discarding anything collected so far.
    """
    api = HfApi()
    spaces = []
    try:
        # Gradio spaces whose listing matches the search term "mcp-server"
        for space in api.list_spaces(
            filter="gradio",
            search="mcp-server",
            limit=100,
            full=True,  # Get full metadata
        ):
            try:
                # Get detailed space information
                space_info: SpaceInfo = api.space_info(repo_id=f"{space.id}")

                # cardData can be None; fall back to an empty mapping so such
                # a space is still recorded (with no description and no tags)
                # instead of being dropped by an AttributeError.
                card = space_info.cardData or {}
                tags = card.get("tags", [])

                # "author/name" -> first and last path segment
                id_parts = space.id.split("/")

                spaces.append({
                    'id': space.id,
                    'title': id_parts[-1],
                    'author': id_parts[0],
                    'description': card.get("description", ""),
                    'likes': space_info.likes,
                    'url': f"https://huggingface.co/spaces/{space.id}",
                    'tags': ' '.join(tags) if tags else None,
                    'last_modified': space_info.lastModified,
                    'private': space_info.private,
                })
            except Exception as e:
                # Best effort: one bad space must not abort the whole fetch.
                print(f"Error fetching space info for {space.id}: {e}")
        return spaces
    except Exception as e:
        print(f"Error fetching spaces: {e}")
        return []
def save_to_database(spaces):
    """Write the fetched spaces to the database, then their endpoints and tools.

    For every space dict one row is written with the spaces query, after
    which ``save_endpoints_and_tools`` is called for the same space on the
    same connection. A failure for one space is printed and the loop moves
    on to the next one. Everything is committed once, after the loop, and
    the connection is closed whether or not an error escaped.

    Args:
        spaces: Sequence of dicts carrying the keys 'id', 'title', 'author',
            'description', 'likes', 'url', 'tags', 'last_modified' and
            'private'.
    """
    # Parameter order expected by the spaces query.
    space_columns = (
        'id', 'title', 'author', 'description', 'likes',
        'url', 'tags', 'last_modified', 'private',
    )
    spaces_sql = load_sql_query(SQL_UPDATE_SPACES)

    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    try:
        for space in spaces:
            try:
                row = tuple(space[column] for column in space_columns)
                cursor.execute(spaces_sql, row)
                # Endpoints and tools go through the same connection so they
                # are part of the single commit below.
                save_endpoints_and_tools(conn, space['id'])
            except sqlite3.IntegrityError as e:
                print(f"Error saving space {space['title']}: {e}")
            except Exception as e:
                print(f"Unexpected error processing space {space['title']}: {e}")

        conn.commit()
        print(f"Database saved at: {DB_PATH}")
        print(f"Processed {len(spaces)} spaces")
    finally:
        conn.close()
def update_database():
    """Refresh the database when it is reported as outdated.

    The tables are always ensured first via ``create_database``. When
    ``is_database_outdated`` says a refresh is due, the spaces are fetched
    and saved and the last-fetch timestamp is updated.

    Returns:
        True when a refresh was performed, False when the database was
        already up to date.
    """
    create_database()

    if is_database_outdated(DB_PATH):
        print("Starting fetching process...")
        save_to_database(fetch_spaces())
        # Record the fetch time only after the data has been saved.
        update_db_timestamp(DB_PATH)
        print("Process complete! Data saved to database")
        return True

    print("Database is up to date")
    return False
# Script entry point: refresh the database when this file is run directly.
if __name__ == "__main__":
    update_database()
|