Daniel Fried committed on
Commit
488ca72
1 Parent(s): 3d9bc9d

add Nicolas' tokenizer building batch

Browse files
Files changed (2) hide show
  1. modules/app.py +2 -0
  2. tokenizers_patch.py +26 -0
modules/app.py CHANGED
@@ -1,6 +1,8 @@
1
  import sys
2
  from typing import List
3
  import traceback
 
 
4
  from transformers import AutoModelForCausalLM, AutoTokenizer
5
  import json
6
 
1
  import sys
2
  from typing import List
3
  import traceback
4
+ # needs to be imported *before* transformers
5
+ import tokenizers_patch
6
  from transformers import AutoModelForCausalLM, AutoTokenizer
7
  import json
8
 
tokenizers_patch.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Import-time patch: build the Hugging Face `tokenizers` Python bindings from
# source and make them importable.
#
# Importing this module has side effects, executed in this order:
#   1. download and run the rustup installer (Rust toolchain),
#   2. run an editable `pip install` of `tokenizers` from its git repository,
#   3. append the checkout's `py_src` directory to `sys.path`,
#   4. import `tokenizers`.
import os
import sys
import subprocess


def _run_shell(command, env=None):
    """Run *command* through the shell and report a non-zero exit status.

    A failure is printed rather than raised, so every step stays
    best-effort, but a broken build no longer passes silently.

    Args:
        command: Shell command line to execute.
        env: Optional environment mapping for the child process.

    Returns:
        The ``subprocess.CompletedProcess`` for the command.
    """
    completed = subprocess.run(command, shell=True, env=env)
    if completed.returncode != 0:
        print("Command failed with exit status", completed.returncode, ":", command)
    return completed


print("Getting rustup")
_run_shell(
    "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y",
)
print("Got rustup")

# Child processes get a copy of the environment with ~/.cargo/bin first on
# PATH, so the freshly installed toolchain is found.
myenv = os.environ.copy()
_cargo_bin = os.path.expanduser("~/.cargo/bin")
# Tolerate an unset PATH and avoid a trailing separator in that case.
myenv["PATH"] = os.pathsep.join(
    part for part in (_cargo_bin, myenv.get("PATH", "")) if part
)

# Diagnostics: is rustc where we expect it, and which version is it?
print("RUSTC", os.path.isfile(os.path.join(_cargo_bin, "rustc")))
_run_shell("rustc --version", env=myenv)

# The backslash keeps the shell from treating "&" as a control operator. It is
# written as "\\&" because a bare "\&" is an invalid Python escape sequence.
_run_shell(
    "pip install -e git+https://github.com/huggingface/tokenizers/#egg=tokenizers\\&subdirectory=bindings/python",
    env=myenv,
)

# Presumably where pip places the editable checkout when run from the current
# working directory -- TODO confirm for the deployment environment.
sys.path.append(
    os.path.join(os.getcwd(), "src", "tokenizers", "bindings", "python", "py_src")
)


import tokenizers