jonatanklosko committed on
Commit
808e59a
•
1 Parent(s): 3b6e0d3

Add tokenizer generator

Browse files
Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- FROM ghcr.io/livebook-dev/livebook:latest-cuda11.8
2
 
3
  ENV LIVEBOOK_APP_SERVICE_NAME "🐳 Hugging Face - $SPACE_TITLE"
4
  ENV LIVEBOOK_APP_SERVICE_URL "https://huggingface.co/spaces/$SPACE_AUTHOR_NAME/$SPACE_REPO_NAME"
@@ -9,7 +9,12 @@ ENV LIVEBOOK_DATA_PATH "/data"
9
  ENV LIVEBOOK_PORT 7860
10
 
11
  EXPOSE 7860
 
12
  USER root
 
 
 
 
13
  COPY public-apps/ /public-apps
14
  RUN mkdir -p /data
15
  RUN chmod 777 /data
 
1
+ FROM ghcr.io/livebook-dev/livebook:latest
2
 
3
  ENV LIVEBOOK_APP_SERVICE_NAME "🐳 Hugging Face - $SPACE_TITLE"
4
  ENV LIVEBOOK_APP_SERVICE_URL "https://huggingface.co/spaces/$SPACE_AUTHOR_NAME/$SPACE_REPO_NAME"
 
9
  ENV LIVEBOOK_PORT 7860
10
 
11
  EXPOSE 7860
12
+
13
  USER root
14
+
15
+ RUN apt-get update && apt-get install -y python3 python3-pip python-is-python3
16
+ RUN pip --no-cache-dir install transformers sentencepiece protobuf
17
+
18
  COPY public-apps/ /public-apps
19
  RUN mkdir -p /data
20
  RUN chmod 777 /data
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Livebook
3
- emoji: 📓
4
- colorFrom: pink
5
- colorTo: purple
6
  sdk: docker
7
  fullWidth: true
8
  ---
9
 
10
- You can install and run [Livebook](https://livebook.dev/) inside a Hugging Face Space. Here's [a tutorial](https://huggingface.co/docs/hub/spaces-sdks-docker-livebook) on how to do that.
 
1
  ---
2
+ title: Bumblebee tools
3
+ emoji: 🐝
4
+ colorFrom: yellow
5
+ colorTo: brown
6
  sdk: docker
7
  fullWidth: true
8
  ---
9
 
10
+ Tools for [elixir-nx/bumblebee](https://github.com/elixir-nx/bumblebee).
public-apps/tokenizer-generator.livemd ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- livebook:{"app_settings":{"access_type":"public","auto_shutdown_ms":5000,"multi_session":true,"output_type":"rich","show_source":true,"slug":"tokenizer-generator"}} -->
2
+
3
+ # Tokenizer generator
4
+
5
+ ```elixir
6
+ Mix.install([
7
+ {:kino, "~> 0.10.0"},
8
+ {:req, "~> 0.4.3"}
9
+ ])
10
+ ```
11
+
12
+ ## Info
13
+
14
+ ```elixir
15
+ Kino.Markdown.new("""
16
+ ## Background
17
+
18
+ HuggingFace repositories store tokenizers in two flavours:
19
+
20
+ 1. "slow tokenizer" - corresponds to a tokenizer implemented in Python
21
+ and stored as `tokenizer_config.json`
22
+
23
+ 2. "fast tokenizer" - corresponds to a tokenizer implemented in Rust
24
+ and stored as `tokenizer.json`
25
+
26
+ Many repositories only include files for 1., but the `transformers` library
27
+ automatically converts "slow tokenizer" to "fast tokenizer" whenever possible.
28
+
29
+ Bumblebee relies on the Rust bindings and therefore always requires the
30
+ `tokenizer.json` file. This app generates that file for any repository with the
31
+ "slow tokenizer".
32
+ """)
33
+ ```
34
+
35
+ ## Generator
36
+
37
+ ```elixir
38
+ Kino.Markdown.new("## Converter")
39
+ ```
40
+
41
+ ```elixir
42
+ {version, 0} =
43
+ System.cmd("python", ["-c", "import transformers; print(transformers.__version__, end='')"])
44
+
45
+ Kino.Markdown.new("""
46
+ `transformers: #{version}`
47
+ """)
48
+ ```
49
+
50
+ ```elixir
51
+ repo_input = Kino.Input.text("HuggingFace repo")
52
+ ```
53
+
54
+ ```elixir
55
+ repo = Kino.Input.read(repo_input)
56
+
57
+ if repo == "" do
58
+ Kino.interrupt!(:normal, "Enter repository.")
59
+ end
60
+ ```
61
+
62
+ ```elixir
63
+ response =
64
+ Req.post!("https://huggingface.co/api/models/#{repo}/paths-info/main",
65
+ json: %{paths: ["tokenizer.json"]}
66
+ )
67
+
68
+ case response do
69
+ %{status: 200, body: []} ->
70
+ :ok
71
+
72
+ %{status: 200, body: [%{"path" => "tokenizer.json"}]} ->
73
+ Kino.interrupt!(:error, "The tokenizer.json file already exists in the given repository.")
74
+
75
+ _ ->
76
+ Kino.interrupt!(:error, "The repository does not exist or requires authentication.")
77
+ end
78
+ ```
79
+
80
+ ```elixir
81
+ output_dir = Path.join(System.tmp_dir!(), repo)
82
+ ```
83
+
84
+ ````elixir
85
+ script = """
86
+ import sys
87
+ from transformers import AutoTokenizer
88
+
89
+ repo = sys.argv[1]
90
+ output_dir = sys.argv[2]
91
+
92
+
93
+ try:
94
+ tokenizer = AutoTokenizer.from_pretrained(repo)
95
+ assert tokenizer.is_fast
96
+ tokenizer.save_pretrained(output_dir)
97
+ except Exception as error:
98
+ print(error)
99
+ exit(1)
100
+ """
101
+
102
+ case System.cmd("python", ["-c", script, repo, output_dir]) do
103
+ {_, 0} ->
104
+ :ok
105
+
106
+ {output, _} ->
107
+ Kino.Markdown.new("""
108
+ ```
109
+ #{output}
110
+ ```
111
+ """)
112
+ |> Kino.render()
113
+
114
+ Kino.interrupt!(:error, "Tokenizer conversion failed.")
115
+ end
116
+ ````
117
+
118
+ ```elixir
119
+ tokenizer_path = Path.join(output_dir, "tokenizer.json")
120
+
121
+ Kino.Download.new(
122
+ fn -> File.read!(tokenizer_path) end,
123
+ filename: "tokenizer.json",
124
+ label: "tokenizer.json"
125
+ )
126
+ ```
127
+
128
+ `````elixir
129
+ Kino.Markdown.new("""
130
+ ### Next steps
131
+
132
+ 1. Go to https://huggingface.co/#{repo}/upload/main.
133
+
134
+ 2. Upload the `tokenizer.json` file.
135
+
136
+ 3. Add the following description:
137
+
138
+ ````markdown
139
+ Generated with:
140
+
141
+ ```python
142
+ from transformers import AutoTokenizer
143
+
144
+ tokenizer = AutoTokenizer.from_pretrained("#{repo}")
145
+ assert tokenizer.is_fast
146
+ tokenizer.save_pretrained("...")
147
+ ```
148
+ ````
149
+
150
+ 4. Submit the PR.
151
+
152
+ """)
153
+ `````
public-apps/welcome.livemd DELETED
@@ -1,46 +0,0 @@
1
- <!-- livebook:{"app_settings":{"access_type":"public","slug":"welcome"}} -->
2
-
3
- # Livebook <3 Hugging Face
4
-
5
- ```elixir
6
- Mix.install([
7
- {:kino, "~> 0.9"}
8
- ])
9
- ```
10
-
11
- ## Section
12
-
13
- This is the source of a deployed notebook.
14
- This notebook is static and simply renders the markdown content below.
15
-
16
- ```elixir
17
- Kino.Markdown.new("""
18
- Welcome to Livebook in Hugging Face!
19
-
20
- This is a deployed notebook, which is also a perfect place to teach you
21
- the ropes in using Livebook with Hugging Face.
22
-
23
- ## Getting started
24
-
25
- First off, if you want to run your own copy of Livebook,
26
- [check our tutorial](https://news.livebook.dev/livebook-inside-hugging-face-spaces-3LQaRi).
27
- Once you clone the space, remember to set `LIVEBOOK_PASSWORD` as
28
- an environment variable on your Space Settings page (a minimum of
29
- 12 digits is required).
30
-
31
- If you are new to Elixir and Livebook, [head out to the Learn page](/learn)
32
- (it requires a password), there you will find resources to get started
33
- with both.
34
-
35
- ## Deploying notebooks
36
-
37
- Livebook is fully collaborative and it enables you to deploy interactive
38
- and collaborative apps just as well. All of your deployable notebooks will
39
- be in the "public-apps" directory of your Spaces repository.
40
-
41
- To deploy your own notebook on Hugging Face, you must click the
42
- <i class="ri-livebook-deploy"></i> icon on the notebook sidebar, set a "Slug"
43
- for the notebook, mark it as public and then drop its `.livemd` file into
44
- the "public-apps" directory of your Spaces repo.
45
- """)
46
- ```