#!/usr/bin/env python3
"""
Download conference/journal proceedings from DBLP as BibTeX files.
Uses the DBLP venue-based search API which is more reliable than
the TOC-based .bht queries (which often return 404 or single entries).
API format:
https://dblp.org/search/publ/api
?q=venue:{VenueName}: year:{year}:
&h=1000 # max results per batch
&f={offset} # pagination offset
&format=bib1 # BibTeX format
Usage:
python scripts/update_db.py
"""
import requests
import time
import sys
from pathlib import Path
# Endpoint of the DBLP publication search API.
DBLP_API = "https://dblp.org/search/publ/api"

# Each entry is (dblp_venue_name, output_prefix, years):
#   dblp_venue_name - exact venue string used in DBLP's "venue:" filter
#   output_prefix   - filename prefix for the saved .bib files
#   years           - iterable of years to fetch for that venue
CONFERENCES = [
    # -- Speech & Audio --
    ("INTERSPEECH", "interspeech", range(2018, 2027)),
    ("ICASSP", "icassp", range(2018, 2027)),
    ("ASRU", "asru", [2019, 2021, 2023, 2025]),
    ("SLT", "slt", [2018, 2021, 2022, 2024]),

    # -- ML / AI --
    ("ICML", "icml", range(2018, 2027)),
    ("NeurIPS", "neurips", range(2017, 2027)),
    ("ICLR", "iclr", range(2018, 2027)),
    ("AAAI", "aaai", range(2018, 2027)),
    ("IJCAI", "ijcai", range(2018, 2027)),
    ("CVPR", "cvpr", range(2018, 2027)),
    ("ECCV", "eccv", [2018, 2020, 2022, 2024]),
    ("ICCV", "iccv", [2019, 2021, 2023, 2025]),

    # -- NLP --
    ("ACL", "acl", range(2018, 2027)),      # includes Findings
    ("EMNLP", "emnlp", range(2018, 2027)),  # includes Findings
    ("NAACL", "naacl", range(2018, 2027)),
    ("EACL", "eacl", range(2018, 2027)),
    ("LREC/COLING", "coling", [2024, 2025]),
    # Older COLING editions are listed under a different venue string:
    # ("COLING", "coling", [2018, 2020, 2022]),

    # -- IR / Web / Data --
    ("SIGIR", "sigir", range(2018, 2027)),
    ("KDD", "kdd", range(2018, 2027)),
    ("WWW", "www", range(2018, 2027)),
    ("WSDM", "wsdm", range(2018, 2027)),
]

# Journals are fetched through the same venue search; same tuple layout.
JOURNALS = [
    ("IEEE ACM Trans Audio Speech Lang Process", "taslp", range(2018, 2027)),
    ("Trans. Assoc. Comput. Linguistics", "tacl", range(2018, 2027)),
]
def download_venue(venue_name: str, prefix: str, year: int, out_dir: Path) -> None:
    """Download one venue/year from the DBLP search API into a BibTeX file.

    Results are fetched in pages of 1000 and saved to
    ``out_dir / f"{prefix}{year}.bib"``. If that file already exists the
    call is a no-op. Nothing is written when the query yields no entries or
    when any request fails, so a later run retries instead of skipping a
    truncated file.

    Args:
        venue_name: Venue string placed in the ``venue:`` search filter.
        prefix: Filename prefix of the output ``.bib`` file.
        year: Publication year placed in the ``year:`` search filter.
        out_dir: Directory that receives the output file.
    """
    out_file = out_dir / f"{prefix}{year}.bib"
    if out_file.exists():
        return  # Skip if already downloaded

    page_size = 1000
    query = f"venue:{venue_name}: year:{year}:"
    pages = []
    total = 0
    offset = 0
    while True:
        try:
            r = requests.get(
                DBLP_API,
                params={"q": query, "h": page_size, "f": offset, "format": "bib1"},
                timeout=30,
                headers={"User-Agent": "BibGuard/1.0"},
            )
            # An HTTP error status (rate limiting, server error) is a failed
            # download, not an empty result: bail out without writing.
            r.raise_for_status()
            text = r.text.strip()
        except requests.RequestException as e:
            print(f" β {prefix}{year}: network error ({e})")
            return

        if "<!DOCTYPE" in text[:100]:
            if pages:
                # An HTML page in the middle of pagination means the result
                # set is truncated; saving it would make every later run
                # skip this venue/year as "already downloaded".
                print(f" β {prefix}{year}: network error (HTML response at offset {offset})")
                return
            break

        # Count entry headers only: a bare "@" also appears inside field
        # values (e.g. "Recall@K" in titles) and would inflate the count.
        n_entries = sum(
            1 for line in text.splitlines() if line.lstrip().startswith("@")
        )
        if n_entries == 0:
            break

        pages.append(text)
        total += n_entries
        if n_entries < page_size:
            break  # short page: this was the last one
        offset += page_size
        time.sleep(1)  # pause between successive page requests

    if pages:
        # Write to a sibling temp file and rename, so an interrupted run
        # never leaves a partial .bib behind that later runs would skip.
        tmp_file = out_file.with_name(out_file.name + ".part")
        tmp_file.write_text("\n\n".join(pages), encoding="utf-8")
        tmp_file.replace(out_file)
        print(f" β {prefix}{year}: {total} entries")
    else:
        print(f" β {prefix}{year}: not on DBLP yet")
def _download_all(venues, out_dir: Path) -> None:
    """Download every (venue, prefix, years) entry, pausing after real downloads."""
    for venue, prefix, years in venues:
        for y in years:
            # download_venue returns immediately when this file already
            # exists, so no request is made and no pause is needed.
            cached = (out_dir / f"{prefix}{y}.bib").exists()
            download_venue(venue, prefix, y, out_dir)
            if not cached:
                time.sleep(0.5)


def main():
    """Download all configured conferences and journals into ``data/raw``.

    The output directory is ``data/raw`` under the parent of this script's
    directory; it is created if missing.
    """
    out = Path(__file__).resolve().parent.parent / "data" / "raw"
    out.mkdir(parents=True, exist_ok=True)

    print("π₯ Downloading conference proceedings from DBLP...")
    _download_all(CONFERENCES, out)

    print("\nπ₯ Downloading journal volumes from DBLP...")
    _download_all(JOURNALS, out)

    print("\nβ Done. Run: python scripts/build_index.py")


if __name__ == "__main__":
    main()
|