Heinrich Dinkel committed on
Commit
79b1301
·
1 Parent(s): a09cac7

updated modeling

Browse files
Files changed (1) hide show
  1. modeling_glap.py +35 -9
modeling_glap.py CHANGED
@@ -19,6 +19,11 @@ from einops import rearrange
19
  from einops.layers.torch import Rearrange
20
  from transformers import PreTrainedModel
21
 
 
 
 
 
 
22
  from .configuration_glap import GlapConfig
23
 
24
 
@@ -815,17 +820,38 @@ class GlapModel(PreTrainedModel):
815
 
816
  def _get_tokenizer(self) -> NllbTokenizer:
817
  if self.tokenizer is None:
818
- # Find the model directory: HuggingFace copies .py files to its cache
819
- # but not .model files, so we use config._name_or_path (the original
820
- # model path) to locate the tokenizer.
 
821
  model_dir = Path(self.config._name_or_path)
822
- if not model_dir.is_dir():
823
- model_dir = Path(__file__).parent
824
- tokenizer_path = model_dir / "sentencepiece.source.256000.model"
825
- if not tokenizer_path.exists():
826
- tokenizer_path = (
827
- Path(__file__).parent / "sentencepiece.source.256000.model"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
828
  )
 
829
  self.tokenizer = NllbTokenizer(tokenizer_path)
830
  return self.tokenizer
831
 
 
19
  from einops.layers.torch import Rearrange
20
  from transformers import PreTrainedModel
21
 
22
+ try:
23
+ from huggingface_hub import hf_hub_download
24
+ except ImportError:
25
+ hf_hub_download = None # type: ignore[assignment,misc]
26
+
27
  from .configuration_glap import GlapConfig
28
 
29
 
 
820
 
821
def _get_tokenizer(self) -> NllbTokenizer:
    """Lazily build and cache the NLLB sentencepiece tokenizer.

    Resolution order for the sentencepiece model file:
      1. the directory named by ``config._name_or_path`` (local checkout),
      2. the directory containing this file (HF modules cache / local install),
      3. a best-effort download from the HuggingFace Hub (only if
         ``huggingface_hub`` is importable).

    Returns:
        The cached ``NllbTokenizer`` instance.

    Raises:
        FileNotFoundError: if the tokenizer file cannot be located by any
            of the three strategies above. If the Hub download was attempted
            and failed, that error is chained as the cause.
    """
    if self.tokenizer is None:
        tokenizer_filename = "sentencepiece.source.256000.model"
        tokenizer_path: Path | str | None = None
        hub_error: Exception | None = None

        # 1. Check config._name_or_path (local directory)
        candidate = Path(self.config._name_or_path) / tokenizer_filename
        if candidate.exists():
            tokenizer_path = candidate

        # 2. Check next to this file (modules cache / local install)
        if tokenizer_path is None:
            candidate = Path(__file__).parent / tokenizer_filename
            if candidate.exists():
                tokenizer_path = candidate

        # 3. Download from the HuggingFace Hub. This may fail offline or
        # when _name_or_path is not a valid repo id; keep the error so the
        # eventual FileNotFoundError does not hide the real cause.
        if tokenizer_path is None and hf_hub_download is not None:
            try:
                tokenizer_path = hf_hub_download(
                    repo_id=self.config._name_or_path,
                    filename=tokenizer_filename,
                )
            except Exception as err:  # best effort; fall through to raise
                hub_error = err

        if tokenizer_path is None:
            raise FileNotFoundError(
                f"Could not find {tokenizer_filename}. "
                f"Searched {self.config._name_or_path} and "
                f"{Path(__file__).parent}."
            ) from hub_error

        self.tokenizer = NllbTokenizer(tokenizer_path)
    return self.tokenizer
857