Raivis Dejus
committed on
Commit
·
e6a2833
1
Parent(s):
d7b5c0f
Adjusting notes
Browse files
app.py
CHANGED
@@ -10,7 +10,7 @@ import tempfile
|
|
10 |
import os
|
11 |
|
12 |
BATCH_SIZE = 8
|
13 |
-
FILE_LIMIT_MB =
|
14 |
YT_LENGTH_LIMIT_S = 300 # limit to 5min YouTube files
|
15 |
|
16 |
device = 0 if torch.cuda.is_available() else "cpu"
|
@@ -33,7 +33,7 @@ def transcribe(model, audio, task):
|
|
33 |
def _return_yt_html_embed(yt_url):
|
34 |
video_id = yt_url.split("?v=")[-1]
|
35 |
HTML_str = (
|
36 |
-
f'<center> <iframe width="
|
37 |
" </center>"
|
38 |
)
|
39 |
return HTML_str
|
@@ -111,11 +111,11 @@ transcribe = gr.Interface(
|
|
111 |
description=("""
|
112 |
Test Latvian speech recognition (STT) models. Three models are available:
|
113 |
|
114 |
-
* [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also
|
115 |
|
116 |
-
* [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM
|
117 |
|
118 |
-
* [large](https://huggingface.co/AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17) - Most accurate, developed by scientists from [ailab.lv](https://ailab.lv/). Requires most RAM and for best performance should be run on a GPU
|
119 |
|
120 |
To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
|
121 |
"""
|
@@ -129,7 +129,6 @@ yt_transcribe = gr.Interface(
|
|
129 |
gr.Dropdown([
|
130 |
("tiny", "RaivisDejus/whisper-tiny-lv"),
|
131 |
("small", "RaivisDejus/whisper-small-lv"),
|
132 |
-
("large", "AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17")
|
133 |
], label="Model", value="RaivisDejus/whisper-small-lv"),
|
134 |
gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL (max 5min long)"),
|
135 |
gr.Radio([("Transcribe", "transcribe"), ("Translate to English", "translate",)], label="Task", value="transcribe")
|
@@ -140,12 +139,10 @@ yt_transcribe = gr.Interface(
|
|
140 |
description=("""
|
141 |
Test Latvian speech recognition (STT) models. Three models are available:
|
142 |
|
143 |
-
* [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also
|
144 |
|
145 |
* [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM
|
146 |
|
147 |
-
* [large](https://huggingface.co/AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17) - Most accurate, developed by scientists from [ailab.lv](https://ailab.lv/). Requires most RAM and for best performance should be run on a GPU
|
148 |
-
|
149 |
To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
|
150 |
"""
|
151 |
),
|
@@ -155,6 +152,6 @@ yt_transcribe = gr.Interface(
|
|
155 |
with demo:
|
156 |
gr.TabbedInterface([transcribe, yt_transcribe], ["Microphone / Audio file", "YouTube"])
|
157 |
|
158 |
-
demo.queue(max_size=
|
159 |
demo.launch()
|
160 |
|
|
|
10 |
import os
|
11 |
|
12 |
BATCH_SIZE = 8
|
13 |
+
FILE_LIMIT_MB = 1
|
14 |
YT_LENGTH_LIMIT_S = 300 # limit to 5min YouTube files
|
15 |
|
16 |
device = 0 if torch.cuda.is_available() else "cpu"
|
|
|
33 |
def _return_yt_html_embed(yt_url):
|
34 |
video_id = yt_url.split("?v=")[-1]
|
35 |
HTML_str = (
|
36 |
+
f'<center> <iframe width="100%" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
|
37 |
" </center>"
|
38 |
)
|
39 |
return HTML_str
|
|
|
111 |
description=("""
|
112 |
Test Latvian speech recognition (STT) models. Three models are available:
|
113 |
|
114 |
+
* [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also poor accuracy. On this demo hardware 30 second audio will take ~45 seconds to transcribe.
|
115 |
|
116 |
+
* [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM. On this demo hardware 30 second audio will take ~1 minute to transcribe.
|
117 |
|
118 |
+
* [large](https://huggingface.co/AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17) - Most accurate, developed by scientists from [ailab.lv](https://ailab.lv/). Requires most RAM and for best performance should be run on a GPU. On this demo hardware 30 second audio will take ~4 minutes to transcribe.
|
119 |
|
120 |
To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
|
121 |
"""
|
|
|
129 |
gr.Dropdown([
|
130 |
("tiny", "RaivisDejus/whisper-tiny-lv"),
|
131 |
("small", "RaivisDejus/whisper-small-lv"),
|
|
|
132 |
], label="Model", value="RaivisDejus/whisper-small-lv"),
|
133 |
gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL (max 5min long)"),
|
134 |
gr.Radio([("Transcribe", "transcribe"), ("Translate to English", "translate",)], label="Task", value="transcribe")
|
|
|
139 |
description=("""
|
140 |
Test Latvian speech recognition (STT) models. Three models are available:
|
141 |
|
142 |
+
* [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also poor accuracy
|
143 |
|
144 |
* [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM
|
145 |
|
|
|
|
|
146 |
To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
|
147 |
"""
|
148 |
),
|
|
|
152 |
with demo:
|
153 |
gr.TabbedInterface([transcribe, yt_transcribe], ["Microphone / Audio file", "YouTube"])
|
154 |
|
155 |
+
demo.queue(max_size=3)
|
156 |
demo.launch()
|
157 |
|