Raivis Dejus committed on
Commit
e6a2833
1 Parent(s): d7b5c0f

Adjusting notes

Browse files
Files changed (1) hide show
  1. app.py +7 -10
app.py CHANGED
@@ -10,7 +10,7 @@ import tempfile
10
  import os
11
 
12
  BATCH_SIZE = 8
13
- FILE_LIMIT_MB = 10
14
  YT_LENGTH_LIMIT_S = 300 # limit to 5min YouTube files
15
 
16
  device = 0 if torch.cuda.is_available() else "cpu"
@@ -33,7 +33,7 @@ def transcribe(model, audio, task):
33
  def _return_yt_html_embed(yt_url):
34
  video_id = yt_url.split("?v=")[-1]
35
  HTML_str = (
36
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
37
  " </center>"
38
  )
39
  return HTML_str
@@ -111,11 +111,11 @@ transcribe = gr.Interface(
111
  description=("""
112
  Test Latvian speech recognition (STT) models. Three models are available:
113
 
114
- * [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also least accurate
115
 
116
- * [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM
117
 
118
- * [large](https://huggingface.co/AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17) - Most accurate, developed by scientists from [ailab.lv](https://ailab.lv/). Requires most RAM and for best performance should be run on a GPU
119
 
120
  To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
121
  """
@@ -129,7 +129,6 @@ yt_transcribe = gr.Interface(
129
  gr.Dropdown([
130
  ("tiny", "RaivisDejus/whisper-tiny-lv"),
131
  ("small", "RaivisDejus/whisper-small-lv"),
132
- ("large", "AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17")
133
  ], label="Model", value="RaivisDejus/whisper-small-lv"),
134
  gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL (max 5min long)"),
135
  gr.Radio([("Transcribe", "transcribe"), ("Translate to English", "translate",)], label="Task", value="transcribe")
@@ -140,12 +139,10 @@ yt_transcribe = gr.Interface(
140
  description=("""
141
  Test Latvian speech recognition (STT) models. Three models are available:
142
 
143
- * [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also least accurate
144
 
145
  * [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM
146
 
147
- * [large](https://huggingface.co/AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17) - Most accurate, developed by scientists from [ailab.lv](https://ailab.lv/). Requires most RAM and for best performance should be run on a GPU
148
-
149
  To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
150
  """
151
  ),
@@ -155,6 +152,6 @@ yt_transcribe = gr.Interface(
155
  with demo:
156
  gr.TabbedInterface([transcribe, yt_transcribe], ["Microphone / Audio file", "YouTube"])
157
 
158
- demo.queue(max_size=10)
159
  demo.launch()
160
 
 
10
  import os
11
 
12
  BATCH_SIZE = 8
13
+ FILE_LIMIT_MB = 1
14
  YT_LENGTH_LIMIT_S = 300 # limit to 5min YouTube files
15
 
16
  device = 0 if torch.cuda.is_available() else "cpu"
 
33
  def _return_yt_html_embed(yt_url):
34
  video_id = yt_url.split("?v=")[-1]
35
  HTML_str = (
36
+ f'<center> <iframe width="100%" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
37
  " </center>"
38
  )
39
  return HTML_str
 
111
  description=("""
112
  Test Latvian speech recognition (STT) models. Three models are available:
113
 
114
+ * [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also poor accuracy. On this demo hardware 30 second audio will take ~45 seconds to transcribe.
115
 
116
+ * [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM. On this demo hardware 30 second audio will take ~1 minute to transcribe.
117
 
118
+ * [large](https://huggingface.co/AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17) - Most accurate, developed by scientists from [ailab.lv](https://ailab.lv/). Requires most RAM and for best performance should be run on a GPU. On this demo hardware 30 second audio will take ~4 minutes to transcribe.
119
 
120
  To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
121
  """
 
129
  gr.Dropdown([
130
  ("tiny", "RaivisDejus/whisper-tiny-lv"),
131
  ("small", "RaivisDejus/whisper-small-lv"),
 
132
  ], label="Model", value="RaivisDejus/whisper-small-lv"),
133
  gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL (max 5min long)"),
134
  gr.Radio([("Transcribe", "transcribe"), ("Translate to English", "translate",)], label="Task", value="transcribe")
 
139
  description=("""
140
  Test Latvian speech recognition (STT) models. Three models are available:
141
 
142
+ * [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also poor accuracy
143
 
144
  * [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM
145
 
 
 
146
  To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
147
  """
148
  ),
 
152
  with demo:
153
  gr.TabbedInterface([transcribe, yt_transcribe], ["Microphone / Audio file", "YouTube"])
154
 
155
+ demo.queue(max_size=3)
156
  demo.launch()
157