#!/usr/bin/env python3
"""Download the latest RRFS prslev GRIB2 file for the current UTC day that contains
composite reflectivity (REFC), along with its .idx sidecar, into ./data/."""
import os
import sys
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timezone

import requests

S3_BUCKET = "https://noaa-rrfs-pds.s3.amazonaws.com"
PREFIX_ROOT = "rrfs_a"

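# The bucket accepts anonymous ListObjects requests; with delimiter="/" the XML
# response reports "subdirectory" levels as <CommonPrefixes> and objects at the
# current level as <Contents>, which is why the helpers below read both element types.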
def list_bucket(prefix: str) -> ET.Element:
    """List one "directory" level of the bucket and return the parsed XML root."""
    params = {"delimiter": "/", "prefix": prefix}
    r = requests.get(S3_BUCKET + "/", params=params, timeout=20)
    r.raise_for_status()
    return ET.fromstring(r.text)

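# Cycle subdirectories are two-digit, zero-padded UTC hours, so the lexicographic
# max() below also picks the numerically latest cycle of the day.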
def find_latest_cycle(day_ymd: str) -> str | None:
    root = list_bucket(f"{PREFIX_ROOT}/rrfs.{day_ymd}/")
    hours = []
    for cp in root.findall("{http://s3.amazonaws.com/doc/2006-03-01/}CommonPrefixes"):
        pref = cp.find("{http://s3.amazonaws.com/doc/2006-03-01/}Prefix").text
        parts = pref.strip("/").split("/")
        if len(parts) >= 3:
            hh = parts[2]
            if hh.isdigit() and len(hh) == 2:
                hours.append(hh)
    return max(hours) if hours else None

def list_prslev_keys(day_ymd: str, hh: str) -> list[str]:
    # Returns keys like rrfs_a/rrfs.YYYYMMDD/HH/rrfs.tHHz.prslev.2p5km.fNNN.DOM.grib2(.idx)
    root = list_bucket(f"{PREFIX_ROOT}/rrfs.{day_ymd}/{hh}/")
    keys = []
    for ct in root.findall("{http://s3.amazonaws.com/doc/2006-03-01/}Contents"):
        key = ct.find("{http://s3.amazonaws.com/doc/2006-03-01/}Key").text
        if "/rrfs.t" in key and ".prslev" in key and key.endswith(".grib2"):
            keys.append(key)
    return keys

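# With the sort key below, hypothetical keys ending in f000.hi.grib2, f000.conus.grib2
# and f001.conus.grib2 would order the Hawaii f000 file first: analysis-time (f000)
# files beat later forecast leads, then smaller domains beat larger ones.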
def choose_smallest_refc_candidate(keys: list[str]) -> str | None:
    # Prefer smaller domains to keep downloads reasonable (hi, pr), then others
    domain_order = ["hi", "pr", "ak", "conus", "na"]
    # Prefer f000 first
    sorted_keys = sorted(
        keys,
        key=lambda k: ("f000" not in k,
                       next((i for i, d in enumerate(domain_order) if f".{d}.grib2" in k), 99),
                       k),
    )
    return sorted_keys[0] if sorted_keys else None

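# The .idx sidecar is a wgrib-style inventory: one colon-separated line per GRIB
# message (record number, starting byte offset, reference time, variable, level,
# forecast), so a substring match on "REFC:" is enough to confirm the
# composite-reflectivity record is present.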
def ensure_refc_in_idx(grib_url: str) -> bool:
    idx_url = grib_url + ".idx"
    r = requests.get(idx_url, timeout=20)
    if r.status_code != 200:
        return False
    return "REFC:" in r.text

def download(url: str, out_path: str):
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)

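# Pipeline: find today's latest cycle, list its prslev files, pick the preferred
# candidate, confirm REFC via its .idx, then download the GRIB2 and index into ./data/.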
def main():
    day = datetime.now(timezone.utc).strftime("%Y%m%d")
    latest = find_latest_cycle(day)
    if latest is None:
        print(f"No cycles found for {day} under {S3_BUCKET}/{PREFIX_ROOT}", file=sys.stderr)
        sys.exit(2)
    keys = list_prslev_keys(day, latest)
    if not keys:
        print(f"No prslev GRIB2 keys found for {day} {latest}Z", file=sys.stderr)
        sys.exit(2)
    candidate = choose_smallest_refc_candidate(keys)
    if candidate is None:
        print("No candidate GRIB2 key found", file=sys.stderr)
        sys.exit(2)
    grib_url = f"{S3_BUCKET}/{candidate}"
    if not ensure_refc_in_idx(grib_url):
        print("Chosen file does not contain REFC in index; aborting per requirements.", file=sys.stderr)
        sys.exit(3)
    os.makedirs("data", exist_ok=True)
    out = os.path.join("data", os.path.basename(candidate))
    print(f"Downloading: {grib_url}\n -> {out}")
    t0 = time.time()
    download(grib_url, out)
    dt = time.time() - t0
    size_mb = os.path.getsize(out) / (1024 * 1024)
    print(f"Done: {size_mb:.1f} MiB in {dt:.1f}s")
    # Save index for quick verification
    idx_path = out + ".idx"
    r = requests.get(grib_url + ".idx", timeout=20)
    r.raise_for_status()
    with open(idx_path, "wb") as f:
        f.write(r.content)
    # Echo REFC lines
    lines = [ln for ln in r.text.splitlines() if "REFC:" in ln]
    print("REFC index lines:")
    for ln in lines[:5]:
        print(ln)


if __name__ == "__main__":
    main()