Polos-Demo / polos /corpora.py
yuwd's picture
init
03f6091
raw
history blame
1.8 kB
# -*- coding: utf-8 -*-
r"""
Corpora
==============
Available corpora to train/test Polos models.
"""
import os
import click
from torchnlp.download import download_file_maybe_extract
# Common prefix of every corpus archive URL listed below.
_POLOS_S3_ROOT = (
    "https://unbabel-experimental-data-sets.s3-eu-west-1.amazonaws.com/polos"
)

# Maps each corpus name to the URL of its zip archive. Every URL has the
# shape ``<root>/<category>/<name>.zip``, where the category segment is
# either "hter" or "da".
corpus2download = {
    name: f"{_POLOS_S3_ROOT}/{category}/{name}.zip"
    for name, category in (
        ("apequest", "hter"),
        ("qt21", "hter"),
        ("wmt-metrics", "da"),
        ("doc-wmt19", "da"),
    )
}
def download_corpus(corpus: str, saving_directory: str = None) -> None:
    """Function that downloads a corpus from AWS.

    The corpus name is lower-cased before use. If a folder named after the
    corpus already exists inside the saving directory, the download is
    skipped. Otherwise the archive URL registered in ``corpus2download`` is
    handed to ``download_file_maybe_extract`` and the leftover compressed
    file is removed afterwards.

    :param corpus: Name of the corpus to be loaded.
    :param saving_directory: RELATIVE path to the saving folder. Falls back
        to ``data/`` when omitted or empty.
    :raises ValueError: If the corpus is not cached and its name is not a
        key of ``corpus2download``.
    """
    corpus = corpus.lower()
    if not saving_directory:
        saving_directory = "data/"

    # Join path components rather than concatenating strings, so a saving
    # directory given without a trailing separator (e.g. "data") still
    # resolves to "data/<corpus>" instead of "data<corpus>".
    corpus_path = os.path.join(saving_directory, corpus)
    if os.path.isdir(corpus_path):
        click.secho(f"{corpus} is already in cache.", fg="yellow")
        return

    # Validate the name before touching the filesystem, so an unknown corpus
    # does not leave an empty saving directory behind.
    if corpus not in corpus2download:
        raise ValueError(f"{corpus} is not a valid corpus!")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(saving_directory, exist_ok=True)
    download_file_maybe_extract(
        corpus2download[corpus],
        directory=saving_directory,
    )
    click.secho("Download succeeded.", fg="yellow")

    # Remove the compressed file if one is found next to the corpus folder;
    # only the first matching extension is deleted.
    for extension in (".zip", ".tar.gz"):
        archive_path = corpus_path + extension
        if os.path.exists(archive_path):
            os.remove(archive_path)
            break
    else:
        click.secho("Fail to delete compressed file.", fg="red")