patrickvonplaten committed on
Commit
7e4953f
1 Parent(s): 4861382
adapt_trans_config.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import shutil
5
+ from tempfile import TemporaryDirectory
6
+ from typing import List, Optional
7
+
8
+ from huggingface_hub import (
9
+ CommitInfo,
10
+ CommitOperationAdd,
11
+ Discussion,
12
+ HfApi,
13
+ hf_hub_download,
14
+ )
15
+ from huggingface_hub.file_download import repo_folder_name
16
+
17
+
18
class AlreadyExists(Exception):
    """Raised when the target model already has an open PR with this fix."""
20
+
21
+
22
def convert_single(model_id: str, folder: str) -> List["CommitOperationAdd"]:
    """Download a model's generation config, fix it, and stage the commit op.

    Downloads `generation_config.json` from *model_id*, writes a corrected
    copy into *folder*, and returns ``(operations, model_type)`` where
    *operations* is a list with one `CommitOperationAdd` and *model_type* is
    the string returned by `convert_file`. Returns ``(False, False)`` when the
    conversion did not produce a usable config.
    """
    config_file_name = "generation_config.json"
    config_file = hf_hub_download(repo_id=model_id, filename=config_file_name)

    old_config_file = config_file

    # BUG FIX: `hf_hub_download` returns an *absolute* path into the HF cache.
    # `os.path.join(folder, config_file)` with an absolute second argument
    # discards `folder` entirely, so the old code overwrote the cached file
    # in place. Join with the bare file name so the corrected copy lands in
    # the scratch folder instead.
    new_config_file = os.path.join(folder, config_file_name)
    success = convert_file(old_config_file, new_config_file)
    if success:
        operations = [
            CommitOperationAdd(
                path_in_repo=config_file_name, path_or_fileobj=new_config_file
            )
        ]
        # `convert_file` returns the model-type string ("Whisper") on success.
        model_type = success
        return operations, model_type
    else:
        return False, False
40
+
41
+
42
def convert_file(
    old_config: str,
    new_config: str,
):
    """Rewrite a generation config with corrected long-form parameters.

    Reads the JSON file at *old_config*, sets `max_initial_timestamp_index`
    to 50, derives `prev_sot_token_id` from the second-to-last entry of
    `suppress_tokens`, then writes the result (sorted keys, 2-space indent,
    trailing newline) to *new_config*.

    Returns the model-type string "Whisper".
    """
    with open(old_config, "r") as src:
        config = json.load(src)

    # Long-form generation fixes (see transformers PR #27658): cap the
    # initial timestamp index and record the pre-SOT token id taken from
    # the suppressed-token list.
    config["max_initial_timestamp_index"] = 50
    config["prev_sot_token_id"] = config["suppress_tokens"][-2]

    serialized = json.dumps(config, indent=2, sort_keys=True) + "\n"
    with open(new_config, "w") as dst:
        dst.write(serialized)

    return "Whisper"
57
+
58
+
59
def previous_pr(api: "HfApi", model_id: str, pr_title: str) -> Optional["Discussion"]:
    """Find an existing open PR on *model_id* whose title equals *pr_title*.

    Returns the matching discussion, or None when none matches or the
    discussion list cannot be fetched at all.
    """
    try:
        all_discussions = api.get_repo_discussions(repo_id=model_id)
    except Exception:
        # Repo may be gated/missing or the API unreachable — treat as "no PR".
        return None

    return next(
        (
            d
            for d in all_discussions
            if d.status == "open" and d.is_pull_request and d.title == pr_title
        ),
        None,
    )
71
+
72
+
73
def convert(api: "HfApi", model_id: str, force: bool = False) -> Optional["CommitInfo"]:
    """Open a PR on *model_id* that corrects its generation config.

    Skips repos without a `generation_config.json`. Raises `AlreadyExists`
    (propagated to the caller) when an open PR with the same title already
    exists and *force* is False. Returns the created PR's `CommitInfo`, the
    pre-existing discussion, or None when nothing was done.
    """
    pr_title = "Correct long-form generation config parameters 'max_initial_timestamp_index' and 'prev_sot_token_id'."
    info = api.model_info(model_id)
    filenames = set(s.rfilename for s in info.siblings)

    # Nothing to fix if the repo ships no generation config at all.
    if "generation_config.json" not in filenames:
        print(f"Model: {model_id} has no generation_config.json file to change")
        return

    with TemporaryDirectory() as d:
        # Mirror the hub cache layout inside the temp dir for the scratch copy.
        folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
        os.makedirs(folder)
        new_pr = None
        try:
            operations = None
            pr = previous_pr(api, model_id, pr_title)
            if pr is not None and not force:
                url = f"https://huggingface.co/{model_id}/discussions/{pr.num}"
                new_pr = pr
                # Deliberately raised (not caught here) so batch callers see
                # the duplicate; `finally` still cleans up the scratch folder.
                raise AlreadyExists(
                    f"Model {model_id} already has an open PR check out {url}"
                )
            else:
                operations, model_type = convert_single(model_id, folder)

            if operations:
                # NOTE(review): pr_title contains no '{}' placeholder, so this
                # .format() call is a no-op — presumably leftover from the
                # safetensors-conversion script this was adapted from; confirm.
                pr_title = pr_title.format(model_type)
                # The namespace part of "org/model" is used to greet the owner.
                contributor = model_id.split("/")[0]
                pr_description = (
                    f"Hey {contributor} 👋, \n\n Your model repository seems to contain outdated generation config parameters, such as 'max_initial_timestamp_index' and is missing the 'prev_sot_token_id' parameter. "
                    "These parameters need to be updated to correctly handle long-form generation as stated in as part of https://github.com/huggingface/transformers/pull/27658. "
                    "This PR makes sure that everything is up to date and can be safely merged. \n\n Best, the Transformers team."
                )
                new_pr = api.create_commit(
                    repo_id=model_id,
                    operations=operations,
                    commit_message=pr_title,
                    commit_description=pr_description,
                    create_pr=True,
                )
                print(f"Pr created at {new_pr.pr_url}")
            else:
                print(f"No files to convert for {model_id}")
        finally:
            # NOTE(review): redundant — TemporaryDirectory removes `d` (and
            # this subfolder) on exit anyway; harmless, kept as-is.
            shutil.rmtree(folder)
    return new_pr
119
+
120
+
121
if __name__ == "__main__":
    # FIX: the previous DESCRIPTION was copy-pasted from the safetensors
    # conversion utility and described a different tool entirely.
    DESCRIPTION = """
    Simple utility tool to correct the `generation_config.json` of Whisper models on the hub.
    It works by downloading the generation config, setting 'max_initial_timestamp_index' to 50
    and adding the 'prev_sot_token_id' parameter, then uploading the corrected file back
    as a PR on the hub.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument(
        "model_id",
        type=str,
        help="The name of the model on the hub to convert. E.g. `gpt2` or `facebook/wav2vec2-base-960h`",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        # FIX: typo "of if" -> "or if".
        help="Create the PR even if it already exists or if the model was already converted.",
    )
    args = parser.parse_args()
    model_id = args.model_id
    api = HfApi()
    convert(api, model_id, force=args.force)
all_ids.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openai/whisper-large-v3
2
+ openai/whisper-large-v2
3
+ brunoqgalvao/whisper-small-pt-br
4
+ openai/whisper-base.en
5
+ openai/whisper-tiny.en
6
+ openai/whisper-small
7
+ distil-whisper/distil-medium.en
8
+ openai/whisper-base
9
+ openai/whisper-tiny
10
+ openai/whisper-small.en
11
+ openai/whisper-medium
12
+ openai/whisper-large
13
+ MU-NLPC/whisper-small-audio-captioning
14
+ distil-whisper/distil-large-v2
15
+ hf-internal-testing/tiny-random-WhisperForConditionalGeneration
16
+ openai/whisper-medium.en
17
+ tarteel-ai/whisper-base-ar-quran
18
+ distil-whisper/distil-small.en
19
+ sawradip/bengali-whisper-medium-tugstugi
20
+ vasista22/whisper-gujarati-medium
21
+ vumichien/whisper-medium-jp
22
+ sanchit-gandhi/whisper-medium-fleurs-lang-id
23
+ vasista22/whisper-hindi-large-v2
24
+ simonl0909/whisper-large-v2-cantonese
25
+ bofenghuang/whisper-small-cv11-french
26
+ shhossain/whisper-tiny-bn-emo
27
+ Subhaka/whisper-small-Sinhala-Fine_Tune
28
+ biodatlab/whisper-th-medium-combined
29
+ primeline/whisper-large-v3-german
30
+ UnlikelyAI/whisper-tiny-demo
download_1.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# Download a tiny test Stable Diffusion pipeline from the Hub.
# NOTE(review): presumably used to pre-warm the local HF cache or as a quick
# connectivity/smoke check — confirm against how this script is invoked.
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "hf-internal-testing/diffusers-stable-diffusion-tiny-all"
)
mass_config_trans_adapt.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Open a generation-config-fix PR for every model id listed in all_ids.txt
# by invoking adapt_trans_config.py once per line.
#
# FIXES: `read -r` prevents backslash mangling, `IFS=` preserves leading or
# trailing whitespace in a line, and quoting "$p" keeps each id a single
# argument even if it contains glob or whitespace characters.
while IFS= read -r p; do
    echo "-------------------------------"
    echo "Attempting to open PR for $p"
    python adapt_trans_config.py "$p"
done <all_ids.txt