Spaces:

voidful
/

RefCheck

Sleeping

File size: 5,035 Bytes

11a28db

#!/usr/bin/env python3
"""
Download conference/journal proceedings from DBLP as BibTeX files.

Uses the DBLP venue-based search API which is more reliable than
the TOC-based .bht queries (which often return 404 or single entries).

API format:
  https://dblp.org/search/publ/api
    ?q=venue:{VenueName}: year:{year}:
    &h=1000        # max results per batch
    &f={offset}    # pagination offset
    &format=bib1   # BibTeX format

Usage:
    python scripts/update_db.py
"""
import requests
import time
import sys
from pathlib import Path

# Endpoint of the DBLP publication search API. Requests against it pass the
# query (q), page size (h), offset (f) and output format as URL parameters.
DBLP_API = "https://dblp.org/search/publ/api"

# Each entry is a (dblp_venue_name, output_prefix, years) tuple:
#   dblp_venue_name: venue string placed in DBLP's "venue:" query filter
#   output_prefix:   filename prefix for saved .bib files ("<prefix><year>.bib")
#   years:           iterable of years to fetch. range() excludes its upper
#                    bound, so range(2018, 2027) covers 2018 through 2026.
#                    Explicit lists are used for venues that are not annual.
CONFERENCES = [
    # ── Speech & Audio ──────────────────────────────────────────
    ("INTERSPEECH",  "interspeech",  range(2018, 2027)),
    ("ICASSP",       "icassp",       range(2018, 2027)),
    ("ASRU",         "asru",         [2019, 2021, 2023, 2025]),
    ("SLT",          "slt",          [2018, 2021, 2022, 2024]),

    # ── ML / AI ─────────────────────────────────────────────────
    ("ICML",         "icml",         range(2018, 2027)),
    ("NeurIPS",      "neurips",      range(2017, 2027)),
    ("ICLR",         "iclr",         range(2018, 2027)),
    ("AAAI",         "aaai",         range(2018, 2027)),
    ("IJCAI",        "ijcai",        range(2018, 2027)),
    ("CVPR",         "cvpr",         range(2018, 2027)),
    ("ECCV",         "eccv",         [2018, 2020, 2022, 2024]),
    ("ICCV",         "iccv",         [2019, 2021, 2023, 2025]),

    # ── NLP ─────────────────────────────────────────────────────
    ("ACL",          "acl",          range(2018, 2027)),       # includes Findings
    ("EMNLP",        "emnlp",        range(2018, 2027)),       # includes Findings
    ("NAACL",        "naacl",        range(2018, 2027)),
    ("EACL",         "eacl",         range(2018, 2027)),
    ("LREC/COLING",  "coling",       [2024, 2025]),
    # Older COLING editions are listed under a different DBLP venue name.
    # The entry below is disabled; note that it would share the "coling"
    # output prefix with the LREC/COLING entry above.
    # ("COLING",       "coling",       [2018, 2020, 2022]),

    # ── IR / Web / Data ─────────────────────────────────────────
    ("SIGIR",        "sigir",        range(2018, 2027)),
    ("KDD",          "kdd",          range(2018, 2027)),
    ("WWW",          "www",          range(2018, 2027)),
    ("WSDM",         "wsdm",         range(2018, 2027)),
]

# Journals are fetched through the same venue search and use the same
# (dblp_venue_name, output_prefix, years) tuple layout as CONFERENCES.
JOURNALS = [
    ("IEEE ACM Trans Audio Speech Lang Process",  "taslp",  range(2018, 2027)),
    ("Trans. Assoc. Comput. Linguistics",         "tacl",   range(2018, 2027)),
]


def download_venue(venue_name: str, prefix: str, year: int, out_dir: Path) -> None:
    """Download one venue/year from DBLP and save it as ``{prefix}{year}.bib``.

    The DBLP search API is queried with ``venue:{venue_name}: year:{year}:``
    in pages of 1000 results until a short or empty page comes back. The
    fetched pages are joined with blank lines and written to *out_dir*.

    Nothing is written when the target file already exists (it is treated as
    already downloaded), when the API returns no BibTeX entries, or when any
    request fails. A failure on a later page discards the pages fetched so
    far: because existing files are skipped, a truncated file would otherwise
    be mistaken for a complete download on every subsequent run.

    Args:
        venue_name: Venue string used in DBLP's ``venue:`` filter.
        prefix: Filename prefix of the output file.
        year: Publication year to fetch.
        out_dir: Directory that receives the ``.bib`` file.
    """
    import re  # local import so the block does not depend on the file's import list

    out_file = out_dir / f"{prefix}{year}.bib"
    if out_file.exists():
        return  # Skip if already downloaded

    page_size = 1000
    # A BibTeX entry header looks like "@inproceedings{key,". Counting headers
    # instead of raw "@" characters keeps titles or e-mail addresses that
    # contain "@" from inflating the count or passing for BibTeX.
    entry_header = re.compile(r"^[ \t]*@[A-Za-z]+[ \t]*\{", re.MULTILINE)

    query = f"venue:{venue_name}: year:{year}:"
    pages = []
    total = 0
    offset = 0

    while True:
        try:
            r = requests.get(DBLP_API, params={
                "q": query, "h": page_size, "f": offset,
                "format": "bib1",
            }, timeout=30, headers={"User-Agent": "BibGuard/1.0"})
            # Treat 4xx/5xx (e.g. rate limiting) as a failure rather than as
            # "end of results", so no partial file is written below.
            r.raise_for_status()
        except requests.RequestException as e:
            print(f"  ✗ {prefix}{year}: network error ({e})")
            return

        text = r.text.strip()

        # An empty body or an HTML page means there is nothing (more) to read.
        if not text or "<!DOCTYPE" in text[:100]:
            break

        n_entries = len(entry_header.findall(text))
        if n_entries == 0:
            break

        pages.append(text)
        total += n_entries
        if n_entries < page_size:
            break  # short page: this was the last one
        offset += page_size
        time.sleep(1)  # pause between consecutive page requests

    if pages:
        out_file.write_text("\n\n".join(pages), encoding="utf-8")
        print(f"  ✓ {prefix}{year}: {total} entries")
    else:
        print(f"  ✗ {prefix}{year}: not on DBLP yet")


def main():
    """Fetch every configured conference and journal year from DBLP.

    Output goes to ``data/raw`` under the parent of the directory containing
    this script; the directory is created if missing. Each (venue, year) in
    CONFERENCES and then JOURNALS is passed to ``download_venue``.
    """
    out = Path(__file__).resolve().parent.parent / "data" / "raw"
    out.mkdir(parents=True, exist_ok=True)

    def fetch_all(venues):
        """Download every (venue, year) pair in *venues* that is not on disk yet."""
        for venue, prefix, years in venues:
            for y in years:
                # download_venue returns silently when "<prefix><year>.bib"
                # already exists, so no request is made for it. Skip such
                # files here to avoid paying the delay below for nothing.
                if (out / f"{prefix}{y}.bib").exists():
                    continue
                download_venue(venue, prefix, y, out)
                time.sleep(0.5)  # pause between venue/year downloads

    print("📥 Downloading conference proceedings from DBLP...")
    fetch_all(CONFERENCES)

    print("\n📥 Downloading journal volumes from DBLP...")
    fetch_all(JOURNALS)

    print("\n✅ Done. Run: python scripts/build_index.py")


# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()