File size: 3,485 Bytes
dace825
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import urllib
import sys
import requests
import os
import logging
import shutil
import json
from tqdm import tqdm
import time
from pathlib import Path
from datasets.download import DownloadConfig, DownloadManager


# Send every log record (INFO and above) to both a log file and stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler("cv12_download.log"),
        logging.StreamHandler(sys.stdout),
    ],
)

# Base endpoint of the Common Voice API used for URL lookup and download logging.
_API_URL = "https://commonvoice.mozilla.org/api/v1"

# Bucket-relative path of a locale's delta bundle; "{locale}" is substituted
# per language before the path is sent to the API.
_BUNDLE_URL_TEMPLATE_DELTA = (
    "cv-corpus-12.0-delta-2022-12-07/"
    "cv-corpus-12.0-delta-2022-12-07-{locale}.tar.gz"
)

# The release name is the first path component of the template.
_BUNDLE_VERSION = _BUNDLE_URL_TEMPLATE_DELTA.partition("/")[0]


def _get_bundle_url(locale, url_template):
    """Ask the API for the download URL of a locale's bundle.

    Args:
        locale: Locale code substituted for the ``{locale}`` placeholder.
        url_template: Bucket-relative bundle path containing ``{locale}``.

    Returns:
        The value of the ``"url"`` field in the API's JSON response.
    """
    # A bare ``import urllib`` does not load the ``parse`` submodule; import it
    # explicitly instead of relying on another package having loaded it.
    import urllib.parse

    path = url_template.replace("{locale}", locale)
    # "/" is deliberately absent from ``safe``, so the path separators are
    # percent-encoded and the whole bundle path travels as one URL segment.
    path = urllib.parse.quote(path.encode("utf-8"), safe="~()*!.'")
    response = requests.get(f"{_API_URL}/bucket/dataset/{path}", timeout=10.0).json()
    return response["url"]


def _log_download(locale, bundle_version):
    """Register a download of ``bundle_version`` for ``locale`` with the API.

    POSTs a JSON payload (email, locale, dataset) to
    ``{_API_URL}/{locale}/downloaders``.

    Args:
        locale: Locale code of the language being downloaded.
        bundle_version: Release name sent as the ``"dataset"`` field.
    """
    email = "polina@huggingface.co"
    payload = {"email": email, "locale": locale, "dataset": bundle_version}
    # Bound the wait with the same timeout _get_bundle_url uses; without one,
    # requests can block indefinitely on a stalled connection.
    # The parsed body is not used; .json() still raises if the reply is not JSON.
    requests.post(
        f"{_API_URL}/{locale}/downloaders", json=payload, timeout=10.0
    ).json()


def download_language(dl_manager, lang, root_dir):
    """Fetch one language's bundle and move it under ``root_dir / "data"``.

    The download is registered and its URL requested from the API. A returned
    URL equal to the bare S3 endpoint is treated as a failed fetch and retried
    after a 15 second pause.

    Args:
        dl_manager: Object providing ``download_and_extract(url)``.
        lang: Locale code of the language to download.
        root_dir: Path under which ``data/<lang>`` is placed.

    Raises:
        ConnectionError: If six consecutive fetches return the bare endpoint.
    """
    bare_endpoint = "https://s3.dualstack.us-west-2.amazonaws.com/"

    attempt = 1
    while True:
        _log_download(lang, _BUNDLE_VERSION)
        url = _get_bundle_url(lang, _BUNDLE_URL_TEMPLATE_DELTA)
        if url != bare_endpoint:
            break
        if attempt == 6:
            raise ConnectionError(f"Cannot download '{lang.upper()}' data, fetched url: {url}. ")
        attempt += 1
        logging.warning(f"Unsuccessful attempt to fetch data url. Trying {attempt} time. ")
        time.sleep(15)

    logging.info(f"Trying to download data for '{lang.upper()}'... ")
    extracted = dl_manager.download_and_extract(url)

    # If the result is not a directory, there was no data update in the release.
    if not os.path.isdir(extracted):
        logging.info(f"No data for '{lang.upper()}' found. ")
        return

    logging.info(f"'{lang.upper()}' data downloaded to {extracted}. ")
    shutil.move(extracted, root_dir / f"data/{lang}")


def main():
    """Download every language listed in ``langs.json``, skipping finished ones.

    Languages already recorded in ``langs_ok.txt`` are skipped. Each success
    is appended to ``langs_ok.txt`` and each ``ConnectionError`` failure to
    ``langs_failed.txt``, one ``"<index>_<lang>"`` entry per line. The loop
    pauses 10 seconds after every attempted language.
    """
    root_dir = Path("")
    with open("langs.json", "r") as f:
        languages = json.load(f).keys()

    if (root_dir / "langs_ok.txt").exists():
        with open(root_dir / "langs_ok.txt") as f:
            # Entries are "<index>_<lang>". Split only on the first underscore
            # so a language code that itself contains "_" is kept whole, and
            # ignore blank or whitespace-only lines.
            langs_to_skip = {
                entry.split("_", 1)[1]
                for entry in (line.strip() for line in f)
                if entry
            }
        logging.info(f"Already downloaded languages: {langs_to_skip}")
    else:
        langs_to_skip = set()

    dl_config = DownloadConfig(
        cache_dir=root_dir / "cache",
        resume_download=True,
        max_retries=5,
    )
    dl_manager = DownloadManager(
        download_config=dl_config,
        record_checksums=False,
    )

    for lang_id, lang in enumerate(tqdm(languages, desc="Processing languages...")):
        if lang in langs_to_skip:
            logging.info(f"Data for '{lang.upper()}' language already downloaded, skipping it. ")
            continue
        try:
            download_language(dl_manager, lang, root_dir=root_dir)
            with open(root_dir / "langs_ok.txt", "a") as f:
                f.write(f"{lang_id}_{lang}\n")
        except ConnectionError as e:
            # Log the exception itself: ``strerror`` is None when the error
            # was constructed from a single message argument.
            logging.error(e)
            with open(root_dir / "langs_failed.txt", "a") as f:
                f.write(f"{lang_id}_{lang}\n")
        time.sleep(10)


# Run the download loop only when executed as a script, not on import.
if __name__ == "__main__":
    main()