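"""Upload a (possibly zipped) model directory to the HuggingFace Hub.

Example invocation (the paths and names below are purely illustrative):
    python upload.py /path/to/model_dir my-model --organization my-org --archive
"""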
import argparse
import json
import logging
import os
import shutil
import tempfile
import zipfile
from datetime import datetime
from pathlib import Path
from typing import Optional, Union

import huggingface_hub

from relik.common.log import get_logger
from relik.common.utils import SAPIENZANLP_DATE_FORMAT, get_md5

logger = get_logger(__name__, level=logging.DEBUG)


def create_info_file(tmpdir: Path):
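    """Write an info.json file with the md5 of model.zip and the upload date."""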
    logger.debug("Computing md5 of model.zip")
    md5 = get_md5(tmpdir / "model.zip")
    date = datetime.now().strftime(SAPIENZANLP_DATE_FORMAT)

    logger.debug("Dumping info.json file")
    with (tmpdir / "info.json").open("w") as f:
        json.dump(dict(md5=md5, upload_date=date), f, indent=2)


def zip_run(
    dir_path: Union[str, os.PathLike],
    tmpdir: Union[str, os.PathLike],
    zip_name: str = "model.zip",
) -> Path:
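    """Zip the content of `dir_path` into `tmpdir / zip_name`.

    Returns the path of the resulting zip file.
    """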
    logger.debug(f"zipping {dir_path} to {tmpdir}")
    # creates a zip version of the provided dir_path
    run_dir = Path(dir_path)
    zip_path = tmpdir / zip_name

    with zipfile.ZipFile(zip_path, "w") as zip_file:
        # fully zip the run directory, maintaining its structure;
        # rglob("*") also picks up files that have no extension
        for file in run_dir.rglob("*"):
            if file.is_dir():
                continue

            zip_file.write(file, arcname=file.relative_to(run_dir))

    return zip_path


def get_logged_in_username():
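    """Return the username of the currently logged-in HuggingFace Hub user."""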
    token = huggingface_hub.HfFolder.get_token()
    if token is None:
        raise ValueError(
            "No HuggingFace token found. You need to execute `huggingface-cli login` first!"
        )
    api = huggingface_hub.HfApi()
    user = api.whoami(token=token)
    return user["name"]


def upload(
    model_dir: Union[str, os.PathLike],
    model_name: str,
    filenames: Optional[list[str]] = None,
    organization: Optional[str] = None,
    repo_name: Optional[str] = None,
    commit: Optional[str] = None,
    archive: bool = False,
):
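    """Upload a model directory to the HuggingFace Hub.

    The target repository is `repo_name` (or `model_name` if not provided),
    optionally under `organization`. If `archive` is True, the directory is
    zipped before the upload; otherwise its files (or only `filenames`, if
    given) are copied as they are.
    """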
    token = huggingface_hub.HfFolder.get_token()
    if token is None:
        raise ValueError(
            "No HuggingFace token found. You need to execute `huggingface-cli login` first!"
        )

    repo_id = repo_name or model_name
    if organization is not None:
        repo_id = f"{organization}/{repo_id}"
    with tempfile.TemporaryDirectory() as tmpdir:
        api = huggingface_hub.HfApi()
        repo_url = api.create_repo(
            token=token,
            repo_id=repo_id,
            exist_ok=True,
        )
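        # NOTE: `Repository` is deprecated in recent huggingface_hub releases
        # in favour of git-free uploads (e.g. `HfApi.upload_folder`)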
        repo = huggingface_hub.Repository(
            str(tmpdir), clone_from=repo_url, use_auth_token=token
        )

        tmp_path = Path(tmpdir)
        if archive:
            # zip the model_dir and write the info.json file next to it
            logger.debug(f"Zipping {model_dir} to {tmp_path}")
            zip_run(model_dir, tmp_path)
            create_info_file(tmp_path)
        else:
            # if the user wants to upload a transformers model, there is no need
            # to zip it: the files are simply copied to the tmpdir
            logger.debug(f"Copying {model_dir} to {tmpdir}")
            if filenames is not None:
                # copy only the requested files
                for filename in filenames:
                    shutil.copy(Path(model_dir) / filename, tmp_path)
            else:
                # copy the whole model directory
                shutil.copytree(model_dir, tmp_path, dirs_exist_ok=True)

        # this method automatically puts large files (>10MB) into git lfs
        repo.push_to_hub(commit_message=commit or "Automatic push from sapienzanlp")


def parse_args() -> argparse.Namespace:
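    """Parse the command-line arguments of the upload script."""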
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "model_dir", help="The directory of the model you want to upload"
    )
    parser.add_argument("model_name", help="The model you want to upload")
    parser.add_argument(
        "--organization",
        help="the name of the organization where you want to upload the model",
    )
    parser.add_argument(
        "--repo_name",
        help="Optional name to use when uploading to the HuggingFace repository",
    )
    parser.add_argument(
        "--commit", help="Commit message to use when pushing to the HuggingFace Hub"
    )
    parser.add_argument(
        "--archive",
        action="store_true",
        help="""
            Whether to compress the model directory before uploading it.
            If True, the model directory will be zipped and the zip file will be uploaded.
            If False, the model directory will be uploaded as is.""",
    )
    return parser.parse_args()


def main():
    upload(**vars(parse_args()))


if __name__ == "__main__":
    main()