import json
import logging
import os
import shutil
import sys
import time
import urllib.parse
from pathlib import Path

import requests
from datasets.download import DownloadConfig, DownloadManager
from tqdm import tqdm

logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler("cv12_download.log"),
        logging.StreamHandler(sys.stdout),
    ],
)
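
# Common Voice 12.0 "delta" bundle (only the data added since the previous
# release) and the public API used to resolve the actual download URLs.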
_BUNDLE_URL_TEMPLATE_DELTA = 'cv-corpus-12.0-delta-2022-12-07/cv-corpus-12.0-delta-2022-12-07-{locale}.tar.gz'
_BUNDLE_VERSION = _BUNDLE_URL_TEMPLATE_DELTA.split("/")[0]
_API_URL = "https://commonvoice.mozilla.org/api/v1"
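

# Ask the Common Voice API for a download URL (typically a time-limited signed
# S3 link) for the given locale's bundle. The bundle path is percent-encoded
# with the same characters left unescaped as JavaScript's encodeURIComponent,
# so '/' is escaped as well.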
def _get_bundle_url(locale, url_template):
    path = url_template.replace("{locale}", locale)
    path = urllib.parse.quote(path.encode("utf-8"), safe="~()*!.'")
    response = requests.get(f"{_API_URL}/bucket/dataset/{path}", timeout=10.0).json()
    return response["url"]
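

# Register the download with the Common Voice API before requesting a bundle;
# the payload appears to mirror what the download form on the website submits.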
def _log_download(locale, bundle_version):
    email = "polina@huggingface.co"
    payload = {"email": email, "locale": locale, "dataset": bundle_version}
    # Use the same timeout as the GET in _get_bundle_url.
    requests.post(f"{_API_URL}/{locale}/downloaders", json=payload, timeout=10.0).json()
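

# Fetch and extract one locale's delta bundle and move it to data/<locale>.
# A ConnectionError is raised if a usable URL cannot be obtained after several
# attempts; the caller records the failure and moves on.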
def download_language(dl_manager, lang, root_dir):
    _log_download(lang, _BUNDLE_VERSION)
    url = _get_bundle_url(lang, _BUNDLE_URL_TEMPLATE_DELTA)
    i = 1
    # The API occasionally returns only the bare bucket root instead of a full
    # download URL; in that case retry (up to 6 attempts in total, 15 s apart).
    while url == "https://s3.dualstack.us-west-2.amazonaws.com/":
        if i == 6:
            raise ConnectionError(f"Cannot download '{lang.upper()}' data, fetched url: {url}.")
        i += 1
        logging.warning(f"Unsuccessful attempt to fetch the data URL, retrying (attempt {i} of 6).")
        time.sleep(15)
        _log_download(lang, _BUNDLE_VERSION)
        url = _get_bundle_url(lang, _BUNDLE_URL_TEMPLATE_DELTA)

    logging.info(f"Trying to download data for '{lang.upper()}'...")
    path = dl_manager.download_and_extract(url)
    if os.path.isdir(path):
        logging.info(f"'{lang.upper()}' data downloaded to {path}.")
        # Make sure the destination parent directory exists before moving the
        # extracted archive into place.
        (root_dir / "data").mkdir(parents=True, exist_ok=True)
        shutil.move(path, root_dir / f"data/{lang}")
    else:
        logging.info(f"No data for '{lang.upper()}' found.")
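

# Iterate over the locales listed in langs.json, e.g. {"ab": "Abkhaz", "cs": "Czech"}
# (only the keys are used; values are ignored). Languages already recorded in
# langs_ok.txt are skipped, and failures are appended to langs_failed.txt.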
def main():
    root_dir = Path(".")
    with open("langs.json", "r") as f:
        languages = json.load(f).keys()

    # langs_ok.txt stores one "<index>_<locale>" line per successfully
    # downloaded language, so finished languages can be skipped on restart.
    if (root_dir / "langs_ok.txt").exists():
        with open(root_dir / "langs_ok.txt") as f:
            langs_to_skip = {line.strip().split("_", 1)[1] for line in f.read().split("\n") if line}
        logging.info(f"Already downloaded languages: {langs_to_skip}")
    else:
        langs_to_skip = set()

    dl_config = DownloadConfig(
        cache_dir=root_dir / "cache",
        resume_download=True,
        max_retries=5,
    )
    dl_manager = DownloadManager(
        download_config=dl_config,
        record_checksums=False,
    )

    for lang_id, lang in enumerate(tqdm(languages, desc="Processing languages...")):
        if lang in langs_to_skip:
            logging.info(f"Data for '{lang.upper()}' already downloaded, skipping it.")
            continue
        try:
            download_language(dl_manager, lang, root_dir=root_dir)
            with open(root_dir / "langs_ok.txt", "a") as f:
                f.write(f"{lang_id}_{lang}\n")
        except ConnectionError as e:
            logging.error(e)
            with open(root_dir / "langs_failed.txt", "a") as f:
                f.write(f"{lang_id}_{lang}\n")
        # Brief pause between languages to avoid hammering the API.
        time.sleep(10)


if __name__ == "__main__":
    main()