Spaces:

takarajordan
/

DiffusionTokenizer

Running

App Files Files Community

Jordan Legg committed 7 days ago

Commit

5b879f4

•

1 Parent(s): 510f4a2

working build

Browse files

Files changed (4) hide show

.gitignore +22 -0
app.py +62 -0
requirements.txt +4 -0
test.py +10 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,22 @@

+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+*.egg-info/
+dist/
+build/
+*.whl
+# Virtual Environment
+venv/
+env/
+ENV/
+.venv/
+.env/
+# Jupyter Notebook
+.ipynb_checkpoints
+# Gradio specific
+gradio_cache/

app.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import gradio as gr
+from transformers import AutoTokenizer
+import json
+from huggingface_hub import hf_hub_download
+def get_tokenizer_names(model_name):
+    try:
+        # First attempt: Try to get names from model_index.json
+        model_info_path = hf_hub_download(model_name, filename="model_index.json")
+        with open(model_info_path, "r") as f:
+            model_info = json.load(f)
+        # Extract tokenizer class names from the JSON
+        tokenizer_1_class = model_info.get("tokenizer", ["", "Unknown"])[1]
+        tokenizer_2_class = model_info.get("tokenizer_2", ["", "Unknown"])[1]
+        return tokenizer_1_class, tokenizer_2_class
+    except Exception:
+        # Second attempt: Fall back to original method
+        try:
+            model_info = AutoTokenizer.from_pretrained(model_name, subfolder="tokenizer", _from_auto=True)
+            config = model_info.init_kwargs
+            return config.get('tokenizer_class', 'Unknown'), config.get('tokenizer_2_class', 'Unknown')
+        except Exception:
+            return "Unknown", "Unknown"
+def count_tokens(model_name, text):
+    # Load the tokenizers from the specified model
+    tokenizer_1 = AutoTokenizer.from_pretrained(f"{model_name}", subfolder="tokenizer")
+    tokenizer_2 = AutoTokenizer.from_pretrained(f"{model_name}", subfolder="tokenizer_2")
+    # Get tokenizer names
+    tokenizer_1_name, tokenizer_2_name = get_tokenizer_names(model_name)
+    # Tokenize the input text
+    tokens_1 = tokenizer_1.tokenize(text)
+    tokens_2 = tokenizer_2.tokenize(text)
+    # Count the tokens
+    count_1 = len(tokens_1)
+    count_2 = len(tokens_2)
+    return f"{tokenizer_1_name}: {count_1} tokens", f"{tokenizer_2_name}: {count_2} tokens"
+# Create a Gradio interface
+iface = gr.Interface(
+    fn=count_tokens,
+    inputs=[
+        gr.Textbox(label="Model Name", placeholder="e.g., black-forest-labs/FLUX.1-dev"),
+        gr.Textbox(label="Text", placeholder="Enter text here...")
+    ],
+    outputs=[
+        gr.Textbox(label="Tokenizer 1"),
+        gr.Textbox(label="Tokenizer 2")
+    ],
+    title="Token Counter",
+    description="Enter a Hugging Face model name and text to count tokens using the model's tokenizers."
+)
+# Launch the app
+iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio
+transformers
+protobuf
+sentencepiece

test.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from huggingface_hub import hf_hub_download
+# Replace "model_name" with the actual model name
+model_info_path = hf_hub_download("shuttleai/shuttle-3-diffusion", filename="model_index.json")
+# Now you can read the contents of the file
+with open(model_info_path, "r") as f:
+    model_info_content = f.read()
+print(model_info_content)