Spaces:

RaivisDejus
/

LatvianSpeechRecognition

Running

App Files Files Community

Raivis Dejus committed on May 2

Commit

e6a2833

•

1 Parent(s): d7b5c0f

Adjusting notes

Browse files

Files changed (1) hide show

app.py +7 -10

app.py CHANGED Viewed

@@ -10,7 +10,7 @@ import tempfile
 import os
 BATCH_SIZE = 8
-FILE_LIMIT_MB = 10
 YT_LENGTH_LIMIT_S = 300  # limit to 5min YouTube files
 device = 0 if torch.cuda.is_available() else "cpu"
@@ -33,7 +33,7 @@ def transcribe(model, audio, task):
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
-        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
         " </center>"
     )
     return HTML_str
@@ -111,11 +111,11 @@ transcribe = gr.Interface(
     description=("""
         Test Latvian speech recognition (STT) models. Three models are available:
-        * [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also least accurate
-        * [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM
-        * [large](https://huggingface.co/AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17) - Most accurate, developed by scientists from [ailab.lv](https://ailab.lv/). Requires most RAM and for best performance should be run on a GPU
         To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
         """
@@ -129,7 +129,6 @@ yt_transcribe = gr.Interface(
         gr.Dropdown([
             ("tiny", "RaivisDejus/whisper-tiny-lv"),
             ("small", "RaivisDejus/whisper-small-lv"),
-            ("large", "AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17")
         ], label="Model", value="RaivisDejus/whisper-small-lv"),
         gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL (max 5min long)"),
         gr.Radio([("Transcribe", "transcribe"), ("Translate to English", "translate",)], label="Task", value="transcribe")
@@ -140,12 +139,10 @@ yt_transcribe = gr.Interface(
     description=("""
         Test Latvian speech recognition (STT) models. Three models are available:
-        * [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also least accurate
         * [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM
-        * [large](https://huggingface.co/AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17) - Most accurate, developed by scientists from [ailab.lv](https://ailab.lv/). Requires most RAM and for best performance should be run on a GPU
         To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
         """
     ),
@@ -155,6 +152,6 @@ yt_transcribe = gr.Interface(
 with demo:
     gr.TabbedInterface([transcribe, yt_transcribe], ["Microphone / Audio file", "YouTube"])
-demo.queue(max_size=10)
 demo.launch()

 import os
 BATCH_SIZE = 8
+FILE_LIMIT_MB = 1
 YT_LENGTH_LIMIT_S = 300  # limit to 5min YouTube files
 device = 0 if torch.cuda.is_available() else "cpu"
 def _return_yt_html_embed(yt_url):
     """Build an HTML snippet that embeds a YouTube video in an iframe.

     Args:
         yt_url: YouTube URL string. The video id is taken to be everything
             after the last ``"?v="`` occurrence in this string.

     Returns:
         An HTML string: a full-width, 320px-high ``<iframe>`` pointing at
         ``https://www.youtube.com/embed/<video_id>``, wrapped in
         ``<center>`` tags.
     """
     # NOTE(review): if the URL has no "?v=" (e.g. a youtu.be short link),
     # split() yields the whole URL unchanged; any query parameters that
     # follow the id (e.g. "&t=10s") also remain part of video_id.
     video_id = yt_url.split("?v=")[-1]
     # NOTE(review): video_id is interpolated into the markup without HTML
     # escaping -- confirm callers only pass trusted/validated URLs.
     HTML_str = (
+        f'<center> <iframe width="100%" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
         " </center>"
     )
     return HTML_str
     description=("""
         Test Latvian speech recognition (STT) models. Three models are available:
+        * [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also poor accuracy. On this demo hardware 30 second audio will take ~45 seconds to transcribe.
+        * [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM. On this demo hardware 30 second audio will take ~1 minute to transcribe.
+        * [large](https://huggingface.co/AiLab-IMCS-UL/whisper-large-v3-lv-late-cv17) - Most accurate, developed by scientists from [ailab.lv](https://ailab.lv/). Requires most RAM and for best performance should be run on a GPU. On this demo hardware 30 second audio will take ~4 minutes to transcribe.
         To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
         """
         gr.Dropdown([
             ("tiny", "RaivisDejus/whisper-tiny-lv"),
             ("small", "RaivisDejus/whisper-small-lv"),
         ], label="Model", value="RaivisDejus/whisper-small-lv"),
         gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL (max 5min long)"),
         gr.Radio([("Transcribe", "transcribe"), ("Translate to English", "translate",)], label="Task", value="transcribe")
     description=("""
         Test Latvian speech recognition (STT) models. Three models are available:
+        * [tiny](https://huggingface.co/RaivisDejus/whisper-tiny-lv) - Fastest, requiring least RAM, but also poor accuracy
         * [small](https://huggingface.co/RaivisDejus/whisper-small-lv) - Reasonably fast, reasonably accurate, requiring reasonable amounts of RAM
         To improve speech recognition quality, more data is needed, add your voice on [Balsu talka](https://balsutalka.lv/)
         """
     ),
 with demo:
     gr.TabbedInterface([transcribe, yt_transcribe], ["Microphone / Audio file", "YouTube"])
+demo.queue(max_size=3)
 demo.launch()