DataHubHub / utils /huggingface_integration.py
whackthejacker's picture
Upload 34 files
43b66f1 verified
import streamlit as st
import pandas as pd
import os
from huggingface_hub import HfApi, list_datasets
from datasets import load_dataset
@st.cache_data(ttl=3600)
def search_huggingface_datasets(query, limit=20):
"""
Search for datasets on Hugging Face Hub.
Args:
query: Search query string
limit: Maximum number of results to return
Returns:
List of dataset metadata
"""
try:
api = HfApi()
datasets = list_datasets(
filter=query,
limit=limit
)
# Convert to list of dicts with relevant info
results = []
for dataset in datasets:
results.append({
'id': dataset.id,
'name': dataset.id.split('/')[-1],
'description': dataset.description or "No description available",
'author': dataset.author or "Unknown",
'tags': dataset.tags,
'downloads': dataset.downloads
})
return results
except Exception as e:
st.error(f"Error searching Hugging Face Hub: {str(e)}")
return []
@st.cache_data(ttl=3600)
def load_huggingface_dataset(dataset_id, split='train'):
"""
Load a dataset from Hugging Face Hub.
Args:
dataset_id: ID of the dataset on HF Hub (e.g., 'mnist', 'glue', etc.)
split: Dataset split to load (e.g., 'train', 'test', 'validation')
Returns:
Pandas DataFrame containing the dataset
"""
try:
# Load the dataset
dataset = load_dataset(dataset_id, split=split)
# Convert to pandas DataFrame
df = dataset.to_pandas()
return df
except Exception as e:
st.error(f"Error loading dataset '{dataset_id}': {str(e)}")
raise
def upload_to_huggingface(dataset, dataset_name, token=None):
"""
Upload a dataset to Hugging Face Hub.
Args:
dataset: Pandas DataFrame to upload
dataset_name: Name for the dataset
token: Hugging Face API token (optional, will use environment variable if not provided)
Returns:
URL to the uploaded dataset
"""
# Get token from environment if not provided
if token is None:
token = os.getenv("HF_TOKEN")
if not token:
raise ValueError("No Hugging Face token provided. Set the HF_TOKEN environment variable or pass a token.")
try:
# Convert to HF dataset
from datasets import Dataset
hf_dataset = Dataset.from_pandas(dataset)
# Upload to HF Hub
push_result = hf_dataset.push_to_hub(
dataset_name,
token=token
)
return f"https://huggingface.co/datasets/{push_result.repo_id}"
except Exception as e:
st.error(f"Error uploading to Hugging Face Hub: {str(e)}")
raise