speech-resource-finder / huggingface_search.py
"""
HuggingFace model and dataset search functionality
"""
import re
import requests
from bs4 import BeautifulSoup
from collections import defaultdict


def parse_stat_number(stat_text):
    """
    Parse HuggingFace stat numbers like '4.07M', '23.4k', '349' into integers
    Returns integer value or 0 if parsing fails
    """
    if not stat_text:
        return 0
    stat_text = stat_text.strip().upper()
    try:
        # Handle 'M' (millions)
        if 'M' in stat_text:
            return int(float(stat_text.replace('M', '')) * 1_000_000)
        # Handle 'K' (thousands)
        elif 'K' in stat_text:
            return int(float(stat_text.replace('K', '')) * 1_000)
        # Plain number
        else:
            return int(stat_text.replace(',', ''))
    except (ValueError, AttributeError):
        return 0
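
# Illustrative conversions (for reference):
#   parse_stat_number("4.07M") -> 4070000
#   parse_stat_number("23.4k") -> 23400
#   parse_stat_number("349")   -> 349
#   parse_stat_number("")      -> 0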


def search_huggingface_models(iso_639_1, iso_639_2, pipeline_tag, max_results=100, max_pages=3):
    """
    Search HuggingFace for models supporting a specific language
    Args:
        iso_639_1: ISO 639-1 (2-letter) code
        iso_639_2: ISO 639-2 (3-letter) code
        pipeline_tag: 'automatic-speech-recognition' or 'text-to-speech'
        max_results: maximum number of models to return
        max_pages: maximum number of pages to search per language code
    Returns:
        tuple: (list of model dictionaries, log messages)
    """
    logs = []
    # Try both language code formats
    codes_to_try = []
    if iso_639_1:
        codes_to_try.append(iso_639_1)
    if iso_639_2:
        codes_to_try.append(iso_639_2)
    if not codes_to_try:
        logs.append("No language codes available for search")
        return [], logs
    logs.append(f"Language codes to search: {set(codes_to_try)}")
    models = []
    seen_models = set()
    for code in codes_to_try:
        if len(models) >= max_results:
            break
        logs.append(f"Searching for language code: {code}")
        # Try multiple pages for this language code
        for page in range(max_pages):
            if len(models) >= max_results:
                break
            try:
                # Use HuggingFace model search with pagination
                url = f"https://huggingface.co/models?pipeline_tag={pipeline_tag}&language={code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"
                logs.append(f" Page {page}: {url}")
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                # Parse model cards from the page
                model_cards = soup.find_all('article', class_='overview-card-wrapper')
                if not model_cards:
                    logs.append(f" No model cards found on page {page}")
                    break
                logs.append(f" Found {len(model_cards)} model cards on page {page}")
                for card in model_cards:
                    if len(models) >= max_results:
                        break
                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            model_name = href.lstrip('/')
                            if model_name and model_name != '#' and model_name not in seen_models:
                                seen_models.add(model_name)
                                # Parse stats directly from the card HTML by looking at SVG icons
                                downloads = 0
                                likes = 0
                                size = ""
                                # Find all SVG elements in the card
                                svgs = card.find_all('svg')
                                for svg in svgs:
                                    # Get the next sibling text after the SVG
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""
                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        # Try to find text in the next sibling element (e.g., <span>)
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)
                                    if not stat_text or len(stat_text) < 1:
                                        continue
                                    # Identify icon type by viewBox or path content
                                    svg_str = str(svg)
                                    # Download icon
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # Like/heart icon
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # Model size icon
                                    elif 'M10 10H8.4V8.4H10V10Zm0-3.2H8.4V5.2H10v1.6ZM6.8 10H5.2V8.4h1.6V10Z' in svg_str:
                                        # Model parameter count (e.g., "2B", "0.6B")
                                        if len(stat_text) <= 6 and re.search(r'\d+\.?\d*\s*[Bb]', stat_text):
                                            size = stat_text
                                models.append({
                                    'name': model_name,
                                    'url': f"https://huggingface.co/{model_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size
                                })
                    except Exception as e:
                        logs.append(f" Error parsing model card: {e}")
                        continue
            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break
    # Sort by downloads (descending)
    models.sort(key=lambda x: x['downloads'], reverse=True)
    logs.append(f"Total unique models found: {len(models)}")
    return models, logs
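
# Example usage of search_huggingface_models (illustrative sketch; requires
# network access and depends on HuggingFace keeping its current HTML layout;
# the language codes below are just an example, Yoruba):
#   models, logs = search_huggingface_models("yo", "yor", "automatic-speech-recognition")
#   for model in models[:5]:
#       print(model["name"], model["downloads"], model["likes"], model["size"])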


def search_huggingface_datasets(iso_639_1, iso_639_2, task_category, max_results=100, max_pages=3):
    """
    Search HuggingFace for datasets supporting a specific language
    Args:
        iso_639_1: ISO 639-1 (2-letter) code
        iso_639_2: ISO 639-2 (3-letter) code
        task_category: 'automatic-speech-recognition' or 'text-to-speech'
        max_results: maximum number of datasets to return
        max_pages: maximum number of pages to search per language code
    Returns:
        tuple: (list of dataset dictionaries, log messages)
    """
    logs = []
    # Collect all unique language codes for this language
    language_codes = set()
    if iso_639_1:
        language_codes.add(iso_639_1)
    if iso_639_2:
        language_codes.add(iso_639_2)
    if not language_codes:
        logs.append("No language codes available for search")
        return [], logs
    logs.append(f"Language codes to search: {language_codes}")
    datasets = []
    seen_datasets = set()
    # Search separately for each language code
    for code in language_codes:
        if len(datasets) >= max_results:
            break
        logs.append(f"Searching for language code: {code}")
        for page in range(max_pages):
            if len(datasets) >= max_results:
                break
            try:
                # Use HuggingFace dataset search
                url = f"https://huggingface.co/datasets?task_categories=task_categories:{task_category}&language=language:{code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"
                logs.append(f" Page {page}: {url}")
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                # Parse dataset cards from the page
                dataset_cards = soup.find_all('article', class_='overview-card-wrapper')
                if not dataset_cards:
                    logs.append(f" No dataset cards found on page {page}")
                    break
                logs.append(f" Found {len(dataset_cards)} dataset cards on page {page}")
                for card in dataset_cards:
                    if len(datasets) >= max_results:
                        break
                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            dataset_path = href.lstrip('/')
                            # Remove "datasets/" prefix if present
                            if dataset_path.startswith('datasets/'):
                                dataset_name = dataset_path[9:]
                            else:
                                dataset_name = dataset_path
                            if dataset_name and dataset_name != '#' and dataset_name not in seen_datasets:
                                seen_datasets.add(dataset_name)
                                # Parse stats directly from the card HTML by looking at SVG icons
                                downloads = 0
                                likes = 0
                                size = ""
                                # Find all SVG elements in the card
                                svgs = card.find_all('svg')
                                for svg in svgs:
                                    # Get the next sibling text after the SVG
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""
                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        # Try to find text in the next sibling element (e.g., <span>)
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)
                                    # Skip non-numeric text like "Viewer", "Updated", etc.
                                    if not stat_text or len(stat_text) < 1 or stat_text in ['Viewer', 'Updated']:
                                        continue
                                    # Identify icon type by viewBox or path content
                                    svg_str = str(svg)
                                    # Download icon
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # Like/heart icon
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # Dataset size icon
                                    elif 'fill-rule="evenodd"' in svg_str and 'clip-rule="evenodd"' in svg_str:
                                        # Dataset size (e.g., "411k", "23.4M", "65.1k")
                                        if any(c in stat_text for c in ['k', 'K', 'm', 'M']) or stat_text.replace(',', '').replace('.', '').isdigit():
                                            size = stat_text
                                datasets.append({
                                    'name': dataset_name,
                                    'url': f"https://huggingface.co/datasets/{dataset_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size
                                })
                    except Exception as e:
                        logs.append(f" Error parsing dataset card: {e}")
                        continue
            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break
    # Sort by downloads (descending)
    datasets.sort(key=lambda x: x['downloads'], reverse=True)
    logs.append(f"Total unique datasets found: {len(datasets)}")
    return datasets, logs
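
# Example usage of search_huggingface_datasets (illustrative sketch; requires
# network access; the language codes below are just an example, Swahili):
#   datasets, logs = search_huggingface_datasets("sw", "swa", "automatic-speech-recognition")
#   top = datasets[0]["name"] if datasets else "none"
#   print(f"Found {len(datasets)} datasets; top by downloads: {top}")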


def deduplicate_models(models):
    """
    Deduplicate models by base name (without user/org prefix)
    Keep the model with the most downloads and record how many other copies were collapsed
    Returns list of deduplicated models with a 'duplicates' count added
    """
    # Group models by base name
    grouped = defaultdict(list)
    for model in models:
        # Extract base name (everything after last '/')
        name_parts = model['name'].split('/')
        if len(name_parts) > 1:
            base_name = name_parts[-1]  # e.g., "whisper-large-v3"
        else:
            base_name = model['name']
        grouped[base_name].append(model)
    # For each group, keep the one with most downloads
    deduplicated = []
    for base_name, model_list in grouped.items():
        # Sort by downloads (descending) and keep the first one
        model_list.sort(key=lambda x: x['downloads'], reverse=True)
        best_model = model_list[0]
        # Record how many other copies were collapsed (group size minus the kept model)
        best_model['duplicates'] = len(model_list) - 1
        deduplicated.append(best_model)
    # Sort by downloads again
    deduplicated.sort(key=lambda x: x['downloads'], reverse=True)
    return deduplicated
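

if __name__ == "__main__":
    # Minimal offline smoke test (illustrative; the sample records below are
    # made up and only exercise the pure helpers, no network access needed).
    assert parse_stat_number("4.07M") == 4_070_000
    assert parse_stat_number("23.4k") == 23_400
    assert parse_stat_number("349") == 349
    assert parse_stat_number("") == 0

    sample_models = [
        {'name': 'openai/whisper-large-v3', 'downloads': 5_000_000, 'likes': 300, 'size': ''},
        {'name': 'someuser/whisper-large-v3', 'downloads': 1_200, 'likes': 4, 'size': ''},
        {'name': 'other/mms-tts-yor', 'downloads': 800, 'likes': 2, 'size': ''},
    ]
    deduped = deduplicate_models(sample_models)
    # The two whisper-large-v3 copies collapse into one; the higher-download copy wins.
    assert deduped[0]['name'] == 'openai/whisper-large-v3'
    assert deduped[0]['duplicates'] == 1
    assert deduped[1]['duplicates'] == 0
    print("Smoke test passed:", [m['name'] for m in deduped])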