Sixparticle committed on
Commit
1e66cc5
·
1 Parent(s): 20917f5

Fix tokenizer added_tokens format and numpy compatibility

Browse files
Files changed (2) hide show
  1. app.py +8 -8
  2. requirements.txt +1 -0
app.py CHANGED
@@ -29,7 +29,7 @@ def log_runtime_versions() -> None:
29
 
30
 
31
def sanitize_added_tokens_file(added_tokens_file: str) -> None:
    """Normalize ``added_tokens.json`` to the ``{token: id}`` mapping format.

    Slow (sentencepiece-based) tokenizers expect ``added_tokens.json`` to be
    a dict of token string -> integer id; this older revision wrote a list,
    which breaks tokenizer loading. Rewrite the file keeping only well-formed
    ``str -> int`` entries and collapsing any other payload to ``{}``.

    Parameters
    ----------
    added_tokens_file:
        Path to ``added_tokens.json``. A missing file is silently ignored.
    """
    if not os.path.exists(added_tokens_file):
        return

    try:
        with open(added_tokens_file, "r", encoding="utf-8") as fp:
            data = json.load(fp)
    except Exception:
        # Corrupt or unreadable JSON: fall back to an empty mapping.
        data = {}

    if isinstance(data, dict):
        # Keep only str -> int pairs; bool subclasses int, so reject it
        # explicitly to avoid writing true/false as token ids.
        sanitized = {
            token: idx
            for token, idx in data.items()
            if isinstance(token, str)
            and isinstance(idx, int)
            and not isinstance(idx, bool)
        }
    else:
        # Lists (the previous workaround) or any other payload: reset.
        sanitized = {}

    with open(added_tokens_file, "w", encoding="utf-8") as fp:
        json.dump(sanitized, fp, ensure_ascii=True)
 
def sanitize_added_tokens_file(added_tokens_file: str) -> None:
    """Normalize added_tokens.json to dict format expected by slow tokenizers.

    ``added_tokens.json`` must be a ``{token: id}`` mapping. Dict payloads
    are filtered down to well-formed ``str -> int`` entries; corrupt JSON,
    lists written by an older workaround, or any other payload are replaced
    by an empty mapping.

    Parameters
    ----------
    added_tokens_file:
        Path to ``added_tokens.json``. A missing file is silently ignored.
    """
    if not os.path.exists(added_tokens_file):
        return

    try:
        with open(added_tokens_file, "r", encoding="utf-8") as fp:
            data = json.load(fp)
    except Exception:
        # Corrupt or unreadable JSON: start from an empty mapping.
        data = {}

    if isinstance(data, dict):
        # bool is a subclass of int, so reject True/False ids explicitly.
        sanitized = {
            k: v
            for k, v in data.items()
            if isinstance(k, str) and isinstance(v, int) and not isinstance(v, bool)
        }
    else:
        # Lists (older workaround) and every other payload collapse to
        # the same empty mapping — no need for a separate list branch.
        sanitized = {}

    with open(added_tokens_file, "w", encoding="utf-8") as fp:
        json.dump(sanitized, fp, ensure_ascii=True)
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  transformers==4.40.2
2
  huggingface_hub==0.36.2
3
  torch==2.1.2
 
4
  sentencepiece>=0.1.96
5
  accelerate>=0.20.0
6
  datasets>=2.0.0
 
1
  transformers==4.40.2
2
  huggingface_hub==0.36.2
3
  torch==2.1.2
4
+ numpy<2
5
  sentencepiece>=0.1.96
6
  accelerate>=0.20.0
7
  datasets>=2.0.0