Should there be tokenizer files in the repo?

#2
by Yhyu13 - opened

I pulled this repo to a local directory and tried to load it with --model-path set to that local path. However, the Hugging Face transformers library still tries to download the tokenizer config from the Hub, which causes the following error:

β”‚ /home/hangyu5/Documents/Git-repoMy/AIResearchVault/repo/LLM/BLOOM/LLMZoo/llmzoo/deploy/webapp/in β”‚
β”‚ ference.py:235 in chat_loop                                                                      β”‚
β”‚                                                                                                  β”‚
β”‚   232 β”‚   β”‚   debug: bool,                                                                       β”‚
β”‚   233 ):                                                                                         β”‚
β”‚   234 β”‚   # Model                                                                                β”‚
β”‚ ❱ 235 β”‚   model, tokenizer = load_model(                                                         β”‚
β”‚   236 β”‚   β”‚   model_path, device, num_gpus, max_gpu_memory, load_8bit, load_4bit, debug          β”‚
β”‚   237 β”‚   )                                                                                      β”‚
β”‚   238                                                                                            β”‚
β”‚                                                                                                  β”‚
β”‚ /home/hangyu5/Documents/Git-repoMy/AIResearchVault/repo/LLM/BLOOM/LLMZoo/llmzoo/deploy/webapp/in β”‚
β”‚ ference.py:94 in load_model                                                                      β”‚
β”‚                                                                                                  β”‚
β”‚    91 β”‚   β”‚   tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)               β”‚
β”‚    92 β”‚   β”‚   model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True,   β”‚
β”‚    93 β”‚   else:                                                                                  β”‚
β”‚ ❱  94 β”‚   β”‚   tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)               β”‚
β”‚    95 β”‚   β”‚   model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True,   β”‚
β”‚    96 β”‚                                                                                          β”‚
β”‚    97 β”‚   if load_8bit:                                                                          β”‚
β”‚                                                                                                  β”‚
β”‚ /home/hangyu5/anaconda3/envs/pheonix/lib/python3.10/site-packages/transformers/models/auto/token β”‚
β”‚ ization_auto.py:642 in from_pretrained                                                           β”‚
β”‚                                                                                                  β”‚
β”‚   639 β”‚   β”‚   β”‚   return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *input   β”‚
β”‚   640 β”‚   β”‚                                                                                      β”‚
β”‚   641 β”‚   β”‚   # Next, let's try to use the tokenizer_config file to get the tokenizer class.     β”‚
β”‚ ❱ 642 β”‚   β”‚   tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)   β”‚
β”‚   643 β”‚   β”‚   if "_commit_hash" in tokenizer_config:                                             β”‚
β”‚   644 β”‚   β”‚   β”‚   kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]                      β”‚
β”‚   645 β”‚   β”‚   config_tokenizer_class = tokenizer_config.get("tokenizer_class")                   β”‚
β”‚                                                                                                  β”‚
β”‚ /home/hangyu5/anaconda3/envs/pheonix/lib/python3.10/site-packages/transformers/models/auto/token β”‚
β”‚ ization_auto.py:486 in get_tokenizer_config                                                      β”‚
β”‚                                                                                                  β”‚
β”‚   483 β”‚   tokenizer_config = get_tokenizer_config("tokenizer-test")                              β”‚
β”‚   484 β”‚   ```"""                                                                                 β”‚
β”‚   485 β”‚   commit_hash = kwargs.get("_commit_hash", None)                                         β”‚
β”‚ ❱ 486 β”‚   resolved_config_file = cached_file(                                                    β”‚
β”‚   487 β”‚   β”‚   pretrained_model_name_or_path,                                                     β”‚
β”‚   488 β”‚   β”‚   TOKENIZER_CONFIG_FILE,                                                             β”‚
β”‚   489 β”‚   β”‚   cache_dir=cache_dir,                                                               β”‚
β”‚                                                                                                  β”‚
β”‚ /home/hangyu5/anaconda3/envs/pheonix/lib/python3.10/site-packages/transformers/utils/hub.py:409  β”‚
β”‚ in cached_file                                                                                   β”‚
β”‚                                                                                                  β”‚
β”‚    406 β”‚   user_agent = http_user_agent(user_agent)                                              β”‚
β”‚    407 β”‚   try:                                                                                  β”‚
β”‚    408 β”‚   β”‚   # Load from URL or cache if already cached                                        β”‚
β”‚ ❱  409 β”‚   β”‚   resolved_file = hf_hub_download(                                                  β”‚
β”‚    410 β”‚   β”‚   β”‚   path_or_repo_id,                                                              β”‚
β”‚    411 β”‚   β”‚   β”‚   filename,                                                                     β”‚
β”‚    412 β”‚   β”‚   β”‚   subfolder=None if len(subfolder) == 0 else subfolder,                         β”‚
β”‚                                                                                                  β”‚
β”‚ /home/hangyu5/anaconda3/envs/pheonix/lib/python3.10/site-packages/huggingface_hub/utils/_validat β”‚
β”‚ ors.py:112 in _inner_fn                                                                          β”‚
β”‚                                                                                                  β”‚
β”‚   109 β”‚   β”‚   β”‚   kwargs.items(),  # Kwargs values                                               β”‚
β”‚   110 β”‚   β”‚   ):                                                                                 β”‚
β”‚   111 β”‚   β”‚   β”‚   if arg_name in ["repo_id", "from_id", "to_id"]:                                β”‚
β”‚ ❱ 112 β”‚   β”‚   β”‚   β”‚   validate_repo_id(arg_value)                                                β”‚
β”‚   113 β”‚   β”‚   β”‚                                                                                  β”‚
β”‚   114 β”‚   β”‚   β”‚   elif arg_name == "token" and arg_value is not None:                            β”‚
β”‚   115 β”‚   β”‚   β”‚   β”‚   has_token = True                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/hangyu5/anaconda3/envs/pheonix/lib/python3.10/site-packages/huggingface_hub/utils/_validat β”‚
β”‚ ors.py:160 in validate_repo_id                                                                   β”‚
β”‚                                                                                                  β”‚
β”‚   157 β”‚   β”‚   raise HFValidationError(f"Repo id must be a string, not {type(repo_id)}: '{repo_   β”‚
β”‚   158 β”‚                                                                                          β”‚
β”‚   159 β”‚   if repo_id.count("/") > 1:                                                             β”‚
β”‚ ❱ 160 β”‚   β”‚   raise HFValidationError(                                                           β”‚
β”‚   161 β”‚   β”‚   β”‚   "Repo id must be in the form 'repo_name' or 'namespace/repo_name':"            β”‚
β”‚   162 β”‚   β”‚   β”‚   f" '{repo_id}'. Use `repo_type` argument if needed."                           β”‚
β”‚   163 β”‚   β”‚   )                                                                                  β”‚
╰─────────────────────────────────────────────────────────────────
FreedomAI org

Yes, we should include the tokenizer files. For now, you can reuse the tokenizer files from FreedomIntelligence/phoenix-inst-chat-7b.

Thanks for pointing that out.

GeneZC changed discussion status to closed
GeneZC changed discussion status to open
FreedomAI org

And we have found a bug in our code, please use the updated version of our repo.

Sign up or log in to comment