Clémentine committed on
Commit
7302987
·
1 Parent(s): 7abc6a7

Added check on tokenizer to prevent submissions which won't run

Browse files
src/submission/check_validity.py CHANGED
@@ -8,6 +8,7 @@ import huggingface_hub
8
  from huggingface_hub import ModelCard
9
  from huggingface_hub.hf_api import ModelInfo
10
  from transformers import AutoConfig
 
11
 
12
  from src.envs import HAS_HIGHER_RATE_LIMIT
13
 
@@ -36,9 +37,24 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
36
  return True, ""
37
 
38
 
39
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False) -> tuple[bool, str]:
40
  try:
41
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  return True, None, config
43
 
44
  except ValueError:
@@ -48,7 +64,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
48
  None
49
  )
50
 
51
- except Exception:
52
  return False, "was not found on hub!", None
53
 
54
 
 
8
  from huggingface_hub import ModelCard
9
  from huggingface_hub.hf_api import ModelInfo
10
  from transformers import AutoConfig
11
+ from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
12
 
13
  from src.envs import HAS_HIGHER_RATE_LIMIT
14
 
 
37
  return True, ""
38
 
39
 
40
+ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
41
  try:
42
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
43
+ if test_tokenizer:
44
+ tokenizer_config = get_tokenizer_config(model_name)
45
+ if tokenizer_config is not None:
46
+ tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
47
+ else:
48
+ tokenizer_class_candidate = config.tokenizer_class
49
+
50
+
51
+ tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
52
+ if tokenizer_class is None:
53
+ return (
54
+ False,
55
+ f"uses {tokenizer_class_candidate}, which is not in a transformers release, therefore not supported at the moment.",
56
+ None
57
+ )
58
  return True, None, config
59
 
60
  except ValueError:
 
64
  None
65
  )
66
 
67
+ except Exception as e:
68
  return False, "was not found on hub!", None
69
 
70
 
src/submission/submit.py CHANGED
@@ -54,12 +54,12 @@ def add_new_eval(
54
 
55
  # Is the model on the hub?
56
  if weight_type in ["Delta", "Adapter"]:
57
- base_model_on_hub, error, _ = is_model_on_hub(base_model, revision, H4_TOKEN)
58
  if not base_model_on_hub:
59
  return styled_error(f'Base model "{base_model}" {error}')
60
 
61
  if not weight_type == "Adapter":
62
- model_on_hub, error, _ = is_model_on_hub(model, revision)
63
  if not model_on_hub:
64
  return styled_error(f'Model "{model}" {error}')
65
 
 
54
 
55
  # Is the model on the hub?
56
  if weight_type in ["Delta", "Adapter"]:
57
+ base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
58
  if not base_model_on_hub:
59
  return styled_error(f'Base model "{base_model}" {error}')
60
 
61
  if not weight_type == "Adapter":
62
+ model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
63
  if not model_on_hub:
64
  return styled_error(f'Model "{model}" {error}')
65