import urllib import sys import requests import os import logging import shutil import json from tqdm import tqdm import time from pathlib import Path from datasets.download import DownloadConfig, DownloadManager logging.basicConfig( format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO, handlers=[ logging.FileHandler("cv12_download.log"), logging.StreamHandler(sys.stdout) ] ) _BUNDLE_URL_TEMPLATE_DELTA = 'cv-corpus-12.0-delta-2022-12-07/cv-corpus-12.0-delta-2022-12-07-{locale}.tar.gz' _BUNDLE_VERSION = _BUNDLE_URL_TEMPLATE_DELTA.split("/")[0] _API_URL = "https://commonvoice.mozilla.org/api/v1" def _get_bundle_url(locale, url_template): path = url_template.replace("{locale}", locale) path = urllib.parse.quote(path.encode("utf-8"), safe="~()*!.'") response = requests.get(f"{_API_URL}/bucket/dataset/{path}", timeout=10.0).json() return response["url"] def _log_download(locale, bundle_version): email = "polina@huggingface.co" payload = {"email": email, "locale": locale, "dataset": bundle_version} requests.post(f"{_API_URL}/{locale}/downloaders", json=payload).json() def download_language(dl_manager, lang, root_dir): _log_download(lang, _BUNDLE_VERSION) url = _get_bundle_url(lang, _BUNDLE_URL_TEMPLATE_DELTA) i = 1 while url == "https://s3.dualstack.us-west-2.amazonaws.com/": if i == 6: raise ConnectionError(f"Cannot download '{lang.upper()}' data, fetched url: {url}. ") i += 1 logging.warning(f"Unsuccessful attempt to fetch data url. Trying {i} time. ") time.sleep(15) _log_download(lang, _BUNDLE_VERSION) url = _get_bundle_url(lang, _BUNDLE_URL_TEMPLATE_DELTA) logging.info(f"Trying to download data for '{lang.upper()}'... ") path = dl_manager.download_and_extract(url) if os.path.isdir(path): logging.info(f"'{lang.upper()}' data downloaded to {path}. ") shutil.move(path, root_dir / f"data/{lang}") else: # if it's not a dir, there was no data update in the release logging.info(f"No data for '{lang.upper()}' found. ") def main(): root_dir = Path("") with open("langs.json", "r") as f: languages = json.load(f).keys() if (root_dir / "langs_ok.txt").exists(): with open(root_dir / "langs_ok.txt") as f: langs_to_skip = set([line.strip().split("_")[1] for line in f.read().split("\n") if line]) logging.info(f"Already downloaded languages: {langs_to_skip}") else: langs_to_skip = set() dl_config = DownloadConfig( cache_dir=root_dir / "cache", resume_download=True, max_retries=5, ) dl_manager = DownloadManager( download_config=dl_config, record_checksums=False, ) for lang_id, lang in enumerate(tqdm(languages, desc="Processing languages...")): if lang in langs_to_skip: logging.info(f"Data for '{lang.upper()}' language already downloaded, skipping it. ") continue try: download_language(dl_manager, lang, root_dir=root_dir) with open(root_dir / "langs_ok.txt", "a") as f: f.write(f"{lang_id}_{lang}\n") except ConnectionError as e: logging.error(e.strerror) with open(root_dir / "langs_failed.txt", "a") as f: f.write(f"{lang_id}_{lang}\n") time.sleep(10) if __name__ == "__main__": main()