Sixparticle committed on
Commit ·
1e66cc5
1
Parent(s): 20917f5
Fix tokenizer added_tokens format and numpy compatibility
Browse files- app.py +8 -8
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -29,7 +29,7 @@ def log_runtime_versions() -> None:
|
|
| 29 |
|
| 30 |
|
| 31 |
def sanitize_added_tokens_file(added_tokens_file: str) -> None:
|
| 32 |
-
"""Normalize
|
| 33 |
if not os.path.exists(added_tokens_file):
|
| 34 |
return
|
| 35 |
|
|
@@ -37,15 +37,15 @@ def sanitize_added_tokens_file(added_tokens_file: str) -> None:
|
|
| 37 |
with open(added_tokens_file, "r", encoding="utf-8") as fp:
|
| 38 |
data = json.load(fp)
|
| 39 |
except Exception:
|
| 40 |
-
data =
|
| 41 |
|
| 42 |
-
if isinstance(data,
|
| 43 |
-
sanitized =
|
| 44 |
-
elif isinstance(data,
|
| 45 |
-
#
|
| 46 |
-
sanitized =
|
| 47 |
else:
|
| 48 |
-
sanitized =
|
| 49 |
|
| 50 |
with open(added_tokens_file, "w", encoding="utf-8") as fp:
|
| 51 |
json.dump(sanitized, fp, ensure_ascii=True)
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
def sanitize_added_tokens_file(added_tokens_file: str) -> None:
    """Normalize added_tokens.json to the dict format expected by slow tokenizers.

    Slow (Python) tokenizers expect ``added_tokens.json`` to be a mapping of
    ``token string -> integer id``. This rewrites the file in place, dropping
    any entry whose key is not a ``str`` or whose value is not a real ``int``.

    Args:
        added_tokens_file: Path to the ``added_tokens.json`` file. If the file
            does not exist, the function is a no-op.

    Returns:
        None. The file is rewritten in place as a JSON object.
    """
    if not os.path.exists(added_tokens_file):
        return

    try:
        with open(added_tokens_file, "r", encoding="utf-8") as fp:
            data = json.load(fp)
    except Exception:
        # Unreadable or invalid JSON: fall back to an empty mapping rather
        # than crashing — the file will be rewritten in a valid format below.
        data = {}

    if isinstance(data, dict):
        # Keep only well-formed entries. NOTE: bool is a subclass of int in
        # Python, so check it explicitly — {"tok": true} must NOT survive.
        sanitized = {
            k: v
            for k, v in data.items()
            if isinstance(k, str) and isinstance(v, int) and not isinstance(v, bool)
        }
    elif isinstance(data, list):
        # If a list was written by older workaround, convert to empty mapping.
        sanitized = {}
    else:
        sanitized = {}

    with open(added_tokens_file, "w", encoding="utf-8") as fp:
        json.dump(sanitized, fp, ensure_ascii=True)
|
requirements.txt
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
transformers==4.40.2
|
| 2 |
huggingface_hub==0.36.2
|
| 3 |
torch==2.1.2
|
|
|
|
| 4 |
sentencepiece>=0.1.96
|
| 5 |
accelerate>=0.20.0
|
| 6 |
datasets>=2.0.0
|
|
|
|
| 1 |
transformers==4.40.2
|
| 2 |
huggingface_hub==0.36.2
|
| 3 |
torch==2.1.2
|
| 4 |
+
numpy<2
|
| 5 |
sentencepiece>=0.1.96
|
| 6 |
accelerate>=0.20.0
|
| 7 |
datasets>=2.0.0
|