Yehor Smoliakov committed
Commit: af58cc7
Parent: 8ca8f7e
Files changed (8):
  1. .dockerignore +2 -0
  2. .gitattributes +0 -35
  3. .gitignore +5 -0
  4. Dockerfile +61 -0
  5. README.md +27 -5
  6. app.py +190 -0
  7. requirements-dev.txt +1 -0
  8. requirements.txt +4 -0
.dockerignore ADDED
@@ -0,0 +1,2 @@
+ .ruff_cache/
+ .venv/
.gitattributes CHANGED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .idea/
+ .venv/
+ .ruff_cache/
+
+ flagged/
Dockerfile ADDED
@@ -0,0 +1,61 @@
+ FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
+
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ RUN apt-get update && \
+     apt-get upgrade -y && \
+     apt-get install -y --no-install-recommends \
+     git \
+     git-lfs \
+     wget \
+     curl \
+     # python build dependencies \
+     build-essential \
+     libssl-dev \
+     zlib1g-dev \
+     libbz2-dev \
+     libreadline-dev \
+     libsqlite3-dev \
+     libncursesw5-dev \
+     xz-utils \
+     tk-dev \
+     libxml2-dev \
+     libxmlsec1-dev \
+     libffi-dev \
+     liblzma-dev \
+     # gradio dependencies \
+     ffmpeg \
+     && apt-get clean \
+     && rm -rf /var/lib/apt/lists/*
+
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:${PATH}
+ WORKDIR ${HOME}/app
+
+ RUN curl https://pyenv.run | bash
+ ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
+ ARG PYTHON_VERSION=3.10.12
+ RUN pyenv install ${PYTHON_VERSION} && \
+     pyenv global ${PYTHON_VERSION} && \
+     pyenv rehash && \
+     pip install --no-cache-dir -U pip setuptools wheel && \
+     pip install packaging ninja
+
+ COPY --chown=1000 ./requirements.txt /tmp/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
+
+ RUN git clone --depth 1 https://huggingface.co/skypro1111/mbart-large-50-verbalization ${HOME}/app/mbart-large-50-verbalization
+
+ COPY --chown=1000 . ${HOME}/app
+ ENV PYTHONPATH=${HOME}/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ CMD ["python", "app.py"]
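Note: the image builds on a CUDA base, while app.py falls back to CPU whenever no GPU is visible. Below is a minimal sketch of a sanity check one might run inside the container; it assumes torch is present (pulled in transitively via the packages in requirements.txt) and simply mirrors the device selection in app.py rather than anything defined by this commit.

```python
# Hypothetical sanity check, not part of the committed files.
# Mirrors the device selection used in app.py.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"torch {torch.__version__}, selected device: {device}")
```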
README.md CHANGED
@@ -1,11 +1,33 @@
  ---
- title: Normalize Text Uk
- emoji: 🔥
  colorFrom: blue
- colorTo: pink
  sdk: docker
  pinned: false
- license: apache-2.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

  ---
+ title: Normalize Text for Ukrainian
+ emoji: 📝
  colorFrom: blue
+ colorTo: yellow
  sdk: docker
  pinned: false
  ---

+ ## Install
+
+ ```shell
+ uv venv --python 3.10
+
+ source .venv/bin/activate
+
+ uv pip install -r requirements.txt
+
+ # in development mode
+ uv pip install -r requirements-dev.txt
+ ```
+
+ ## Build image
+
+ ```shell
+ docker build -t normalize-text-uk .
+ ```
+
+ ## Run
+
+ ```shell
+ docker run -it --rm -p 8888:7860 normalize-text-uk
+ ```
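Once the container is up, the Gradio UI is served on host port 8888 (mapped to the container's 7860 in the run command above). A hedged sketch of calling it programmatically with gradio_client follows; the endpoint name is an assumption (Gradio derives it from the handler function by default), so `view_api()` is used to confirm what is actually exposed.

```python
# Hypothetical client-side sketch, not part of this commit.
# Assumes `pip install gradio_client` and the app reachable on localhost:8888.
from gradio_client import Client

client = Client("http://localhost:8888")
client.view_api()  # prints the exposed endpoints; "/inference" below is an assumption

result = client.predict(
    "Над Україною збили ракету та 7 з 8 Шахедів",
    api_name="/inference",
)
print(result)
```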
app.py ADDED
@@ -0,0 +1,190 @@
+ import sys
+ import time
+
+ from importlib.metadata import version
+
+ import torch
+ import gradio as gr
+
+ from transformers import MBartForConditionalGeneration, AutoTokenizer
+
+ # Config
+ model_name = "/home/user/app/mbart-large-50-verbalization"
+ concurrency_limit = 5
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load the model
+ model = MBartForConditionalGeneration.from_pretrained(
+     model_name,
+     low_cpu_mem_usage=True,
+     device_map=device,
+ )
+ model.eval()
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ tokenizer.src_lang = "uk_XX"
+ tokenizer.tgt_lang = "uk_XX"
+
+ examples = [
+     "WP: F-16 навряд чи значно змінять ситуацію на полі бою",
+     "Над Україною збили ракету та 7 з 8 Шахедів",
+     "Олімпійські ігри-2024. Розклад змагань українських спортсменів на 28 липня",
+     "Кампанія Гарріс менш як за тиждень зібрала понад $200 млн",
+     "За тиждень Нацбанк продав майже 800 мільйонів доларів на міжбанку",
+     "Париж-2024. День 2. Текстова трансляція",
+ ]
+
+ title = "Normalize Text for Ukrainian"
+
+ # https://www.tablesgenerator.com/markdown_tables
+ authors_table = """
+ ## Authors
+
+ Follow them on social networks and **contact** if you need any help or have any questions:
+
+ | <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
+ |-------------------------------------------------------------------------------------------------|
+ | https://t.me/smlkw in Telegram |
+ | https://x.com/yehor_smoliakov at X |
+ | https://github.com/egorsmkv at GitHub |
+ | https://huggingface.co/Yehor at Hugging Face |
+ | or use egorsmkv@gmail.com |
+ """.strip()
+
+ description_head = f"""
+ # {title}
+
+ ## Overview
+
+ This space uses https://huggingface.co/skypro1111/mbart-large-50-verbalization model.
+
+ Paste the text you want to enhance.
+ """.strip()
+
+ description_foot = f"""
+ {authors_table}
+ """.strip()
+
+ normalized_text_value = """
+ Normalized text will appear here.
+
+ Choose **an example** below the Normalize button or paste **your text**.
+ """.strip()
+
+ tech_env = f"""
+ #### Environment
+
+ - Python: {sys.version}
+ """.strip()
+
+ tech_libraries = f"""
+ #### Libraries
+
+ - gradio: {version('gradio')}
+ """.strip()
+
+
+ def inference(text, progress=gr.Progress()):
+     if not text:
+         raise gr.Error("Please paste your text.")
+
+     gr.Info("Starting normalizing", duration=2)
+
+     progress(0, desc="Normalizing...")
+
+     results = []
+
+     sentences = [
+         text,
+     ]
+
+     for sentence in progress.tqdm(sentences, desc="Normalizing...", unit="sentence"):
+         sentence = sentence.strip()
+
+         if len(sentence) == 0:
+             continue
+
+         t0 = time.time()
+
+         input_text = "<verbalization>:" + sentence
+
+         encoded_input = tokenizer(
+             input_text,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+             max_length=1024,
+         ).to(device)
+         output_ids = model.generate(
+             **encoded_input, max_length=1024, num_beams=5, early_stopping=True
+         )
+         normalized_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+         if not normalized_text:
+             normalized_text = "-"
+
+         elapsed_time = round(time.time() - t0, 2)
+
+         normalized_text = normalized_text.strip()
+         results.append(
+             {
+                 "sentence": sentence,
+                 "normalized_text": normalized_text,
+                 "elapsed_time": elapsed_time,
+             }
+         )
+
+     gr.Info("Finished!", duration=2)
+
+     result_texts = []
+
+     for result in results:
+         result_texts.append(f'> {result["normalized_text"]}')
+         result_texts.append("\n")
+
+     sum_elapsed_text = sum([result["elapsed_time"] for result in results])
+     result_texts.append(f"Elapsed time: {sum_elapsed_text} seconds")
+
+     return "\n".join(result_texts)
+
+
+ demo = gr.Blocks(
+     title=title,
+     analytics_enabled=False,
+     # theme="huggingface",
+     theme=gr.themes.Base(),
+ )
+
+ with demo:
+     gr.Markdown(description_head)
+
+     gr.Markdown("## Usage")
+
+     with gr.Row():
+         text = gr.Textbox(label="Text", autofocus=True, max_lines=1)
+         normalized_text = gr.Textbox(
+             label="Normalized text",
+             placeholder=normalized_text_value,
+             show_copy_button=True,
+         )
+
+     gr.Button("Normalize").click(
+         inference,
+         concurrency_limit=concurrency_limit,
+         inputs=text,
+         outputs=normalized_text,
+     )
+
+     with gr.Row():
+         gr.Examples(label="Choose an example", inputs=text, examples=examples)
+
+     gr.Markdown(description_foot)
+
+     gr.Markdown("### Gradio app uses the following technologies:")
+     gr.Markdown(tech_env)
+     gr.Markdown(tech_libraries)
+
+ if __name__ == "__main__":
+     demo.queue()
+     demo.launch()
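The core of app.py is the prompt-and-generate step: prefix the input with `<verbalization>:`, tokenize with `uk_XX` as both source and target language, and beam-search decode up to 1024 tokens. Below is a standalone sketch of that same step without Gradio; it loads the model from its Hub id (the repository the Dockerfile clones) instead of the local path used in the container.

```python
# Standalone sketch of the verbalization step from app.py (no Gradio).
# Loads skypro1111/mbart-large-50-verbalization from the Hub instead of the local clone.
import torch
from transformers import MBartForConditionalGeneration, AutoTokenizer

model_name = "skypro1111/mbart-large-50-verbalization"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.src_lang = "uk_XX"
tokenizer.tgt_lang = "uk_XX"

text = "Над Україною збили ракету та 7 з 8 Шахедів"
encoded = tokenizer(
    "<verbalization>:" + text,
    return_tensors="pt",
    truncation=True,
    max_length=1024,
).to(device)

with torch.inference_mode():
    output_ids = model.generate(
        **encoded, max_length=1024, num_beams=5, early_stopping=True
    )

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```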
requirements-dev.txt ADDED
@@ -0,0 +1 @@
+ ruff
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio
+
+ transformers
+ accelerate