Jordan Legg committed on
Commit
5b879f4
1 Parent(s): 510f4a2

working build

Browse files
Files changed (4) hide show
  1. .gitignore +22 -0
  2. app.py +62 -0
  3. requirements.txt +4 -0
  4. test.py +10 -0
.gitignore ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ *.whl
10
+
11
+ # Virtual Environment
12
+ venv/
13
+ env/
14
+ ENV/
15
+ .venv/
16
+ .env/
17
+
18
+ # Jupyter Notebook
19
+ .ipynb_checkpoints
20
+
21
+ # Gradio specific
22
+ gradio_cache/
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer
3
+ import json
4
+ from huggingface_hub import hf_hub_download
5
+
6
def get_tokenizer_names(model_name):
    """Return the class names of a model's two tokenizers.

    The names are first looked up in the repository's ``model_index.json``.
    If that lookup fails for any reason, the ``tokenizer`` subfolder is
    loaded with ``AutoTokenizer`` and its ``init_kwargs`` are consulted
    instead.

    Args:
        model_name: Hugging Face Hub repository id, e.g.
            ``"black-forest-labs/FLUX.1-dev"``.

    Returns:
        A ``(tokenizer_1_class, tokenizer_2_class)`` tuple. A name that
        cannot be determined is reported as ``"Unknown"``; failures in both
        lookup strategies yield ``("Unknown", "Unknown")`` rather than an
        exception.
    """
    try:
        # First attempt: read the class names from model_index.json.
        model_info_path = hf_hub_download(model_name, filename="model_index.json")
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        with open(model_info_path, "r", encoding="utf-8") as f:
            model_info = json.load(f)

        # Each entry is indexed at [1] to obtain the class name. A null or
        # empty class name is normalised to "Unknown" so that the caller
        # never formats a label such as "None: 3 tokens".
        default_entry = ["", "Unknown"]
        tokenizer_1_class = model_info.get("tokenizer", default_entry)[1] or "Unknown"
        tokenizer_2_class = model_info.get("tokenizer_2", default_entry)[1] or "Unknown"

        return tokenizer_1_class, tokenizer_2_class

    except Exception:
        # Deliberately broad: a missing file, a network error or a malformed
        # index all fall through to the second strategy.
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder="tokenizer", _from_auto=True)
            config = tokenizer.init_kwargs
            # NOTE(review): both keys are read from the first tokenizer's
            # init_kwargs; 'tokenizer_2_class' is presumably never present
            # there, so the second name likely stays "Unknown" - confirm.
            return config.get('tokenizer_class', 'Unknown'), config.get('tokenizer_2_class', 'Unknown')
        except Exception:
            # Best effort only: the names are cosmetic labels for the UI.
            return "Unknown", "Unknown"
27
+
28
def count_tokens(model_name, text):
    """Count the tokens each of a model's two tokenizers produces for ``text``.

    Args:
        model_name: Hugging Face Hub repository id whose ``tokenizer`` and
            ``tokenizer_2`` subfolders are loaded.
        text: The text to tokenize.

    Returns:
        Two strings, one per tokenizer, each of the form
        ``"<tokenizer name>: <count> tokens"``.
    """
    repo_id = f"{model_name}"

    # Load both tokenizers, first "tokenizer" and then "tokenizer_2".
    first_tokenizer, second_tokenizer = [
        AutoTokenizer.from_pretrained(repo_id, subfolder=folder)
        for folder in ("tokenizer", "tokenizer_2")
    ]

    # Display names for the two result labels.
    first_name, second_name = get_tokenizer_names(model_name)

    # The count is the length of the token list returned by tokenize().
    first_count = len(first_tokenizer.tokenize(text))
    second_count = len(second_tokenizer.tokenize(text))

    first_label = f"{first_name}: {first_count} tokens"
    second_label = f"{second_name}: {second_count} tokens"
    return first_label, second_label
45
+
46
# Build the Gradio UI: two text inputs (model name, text) feed count_tokens,
# and its two result strings are shown in two output text boxes.
_input_boxes = [
    gr.Textbox(label="Model Name", placeholder="e.g., black-forest-labs/FLUX.1-dev"),
    gr.Textbox(label="Text", placeholder="Enter text here..."),
]
_output_boxes = [
    gr.Textbox(label="Tokenizer 1"),
    gr.Textbox(label="Tokenizer 2"),
]

iface = gr.Interface(
    fn=count_tokens,
    inputs=_input_boxes,
    outputs=_output_boxes,
    title="Token Counter",
    description="Enter a Hugging Face model name and text to count tokens using the model's tokenizers.",
)

# Start serving the interface.
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ protobuf
4
+ sentencepiece
test.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import hf_hub_download
2
+
3
# Download model_index.json for the repository below; change the repo id to
# inspect a different model.
model_info_path = hf_hub_download("shuttleai/shuttle-3-diffusion", filename="model_index.json")

# Read the downloaded file, decoding explicitly as UTF-8 instead of relying
# on the locale default encoding.
with open(model_info_path, "r", encoding="utf-8") as f:
    model_info_content = f.read()

# Show the raw JSON so the tokenizer entries can be inspected by eye.
print(model_info_content)