Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +193 -142
  3. packages.txt +0 -1
  4. requirements.txt +1 -3
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📉
4
  colorFrom: pink
5
  colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 3.38.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
4
  colorFrom: pink
5
  colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 3.41.2
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py CHANGED
@@ -1,151 +1,202 @@
1
- import torch
2
-
3
- import gradio as gr
4
- import yt_dlp as youtube_dl
5
- from transformers import pipeline
6
- from transformers.pipelines.audio_utils import ffmpeg_read
7
-
8
- import tempfile
9
  import os
 
 
 
10
 
11
- MODEL_NAME = "openai/whisper-large-v3"
12
- BATCH_SIZE = 8
13
- FILE_LIMIT_MB = 1000
14
- YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
15
-
16
- device = 0 if torch.cuda.is_available() else "cpu"
17
-
18
- pipe = pipeline(
19
- task="automatic-speech-recognition",
20
- model=MODEL_NAME,
21
- chunk_length_s=30,
22
- device=device,
23
- )
24
-
25
-
26
- def transcribe(inputs, task):
27
- if inputs is None:
28
- raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
29
-
30
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
31
- return text
32
 
 
33
 
34
- def _return_yt_html_embed(yt_url):
35
- video_id = yt_url.split("?v=")[-1]
36
- HTML_str = (
37
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
38
- " </center>"
39
- )
40
- return HTML_str
41
 
42
- def download_yt_audio(yt_url, filename):
43
- info_loader = youtube_dl.YoutubeDL()
44
-
45
- try:
46
- info = info_loader.extract_info(yt_url, download=False)
47
- except youtube_dl.utils.DownloadError as err:
48
- raise gr.Error(str(err))
49
 
50
- file_length = info["duration_string"]
51
- file_h_m_s = file_length.split(":")
52
- file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
53
 
54
- if len(file_h_m_s) == 1:
55
- file_h_m_s.insert(0, 0)
56
- if len(file_h_m_s) == 2:
57
- file_h_m_s.insert(0, 0)
58
- file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
59
 
60
- if file_length_s > YT_LENGTH_LIMIT_S:
61
- yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
62
- file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
63
- raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
64
 
65
- ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
66
-
67
- with youtube_dl.YoutubeDL(ydl_opts) as ydl:
68
- try:
69
- ydl.download([yt_url])
70
- except youtube_dl.utils.ExtractorError as err:
71
- raise gr.Error(str(err))
72
-
73
-
74
- def yt_transcribe(yt_url, task, max_filesize=75.0):
75
- html_embed_str = _return_yt_html_embed(yt_url)
76
-
77
- with tempfile.TemporaryDirectory() as tmpdirname:
78
- filepath = os.path.join(tmpdirname, "video.mp4")
79
- download_yt_audio(yt_url, filepath)
80
- with open(filepath, "rb") as f:
81
- inputs = f.read()
82
-
83
- inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
84
- inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
85
-
86
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
87
-
88
- return html_embed_str, text
89
-
90
-
91
- demo = gr.Blocks()
92
-
93
- mf_transcribe = gr.Interface(
94
- fn=transcribe,
95
- inputs=[
96
- gr.inputs.Audio(source="microphone", type="filepath", optional=True),
97
- gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
98
- ],
99
- outputs="text",
100
- layout="horizontal",
101
- theme="huggingface",
102
- title="Whisper Large V3: Transcribe Audio",
103
- description=(
104
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
105
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
106
- " of arbitrary length."
107
- ),
108
- allow_flagging="never",
109
- )
110
-
111
- file_transcribe = gr.Interface(
112
- fn=transcribe,
113
- inputs=[
114
- gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
115
- gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
116
- ],
117
- outputs="text",
118
- layout="horizontal",
119
- theme="huggingface",
120
- title="Whisper Large V3: Transcribe Audio",
121
- description=(
122
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
123
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
124
- " of arbitrary length."
125
- ),
126
- allow_flagging="never",
127
- )
128
-
129
- yt_transcribe = gr.Interface(
130
- fn=yt_transcribe,
131
- inputs=[
132
- gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
133
- gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe")
134
- ],
135
- outputs=["html", "text"],
136
- layout="horizontal",
137
- theme="huggingface",
138
- title="Whisper Large V3: Transcribe YouTube",
139
- description=(
140
- "Transcribe long-form YouTube videos with the click of a button! Demo uses the OpenAI Whisper checkpoint"
141
- f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
142
- " arbitrary length."
143
- ),
144
- allow_flagging="never",
145
- )
146
-
147
- with demo:
148
- gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
149
-
150
- demo.launch(enable_queue=True)
151
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ os.system("pip install git+https://github.com/openai/whisper.git")
3
+ import gradio as gr
4
+ import whisper
5
 
6
+ from share_btn import community_icon_html, loading_icon_html, share_js
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ model = whisper.load_model("small")
9
 
 
 
 
 
 
 
 
10
 
11
+
12
+ def inference(audio):
13
+ audio = whisper.load_audio(audio)
14
+ audio = whisper.pad_or_trim(audio)
 
 
 
15
 
16
+ mel = whisper.log_mel_spectrogram(audio).to(model.device)
 
 
17
 
18
+ _, probs = model.detect_language(mel)
 
 
 
 
19
 
20
+ options = whisper.DecodingOptions(fp16 = False)
21
+ result = whisper.decode(model, mel, options)
 
 
22
 
23
+ print(result.text)
24
+ return result.text, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
25
+
26
+
27
+
28
+
29
+ css = """
30
+ .gradio-container {
31
+ font-family: 'IBM Plex Sans', sans-serif;
32
+ }
33
+ .gr-button {
34
+ color: white;
35
+ border-color: black;
36
+ background: black;
37
+ }
38
+ input[type='range'] {
39
+ accent-color: black;
40
+ }
41
+ .dark input[type='range'] {
42
+ accent-color: #dfdfdf;
43
+ }
44
+ .container {
45
+ max-width: 730px;
46
+ margin: auto;
47
+ padding-top: 1.5rem;
48
+ }
49
+
50
+ .details:hover {
51
+ text-decoration: underline;
52
+ }
53
+ .gr-button {
54
+ white-space: nowrap;
55
+ }
56
+ .gr-button:focus {
57
+ border-color: rgb(147 197 253 / var(--tw-border-opacity));
58
+ outline: none;
59
+ box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
60
+ --tw-border-opacity: 1;
61
+ --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
62
+ --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
63
+ --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
64
+ --tw-ring-opacity: .5;
65
+ }
66
+ .footer {
67
+ margin-bottom: 45px;
68
+ margin-top: 35px;
69
+ text-align: center;
70
+ border-bottom: 1px solid #e5e5e5;
71
+ }
72
+ .footer>p {
73
+ font-size: .8rem;
74
+ display: inline-block;
75
+ padding: 0 10px;
76
+ transform: translateY(10px);
77
+ background: white;
78
+ }
79
+ .dark .footer {
80
+ border-color: #303030;
81
+ }
82
+ .dark .footer>p {
83
+ background: #0b0f19;
84
+ }
85
+ .prompt h4{
86
+ margin: 1.25em 0 .25em 0;
87
+ font-weight: bold;
88
+ font-size: 115%;
89
+ }
90
+ .animate-spin {
91
+ animation: spin 1s linear infinite;
92
+ }
93
+ @keyframes spin {
94
+ from {
95
+ transform: rotate(0deg);
96
+ }
97
+ to {
98
+ transform: rotate(360deg);
99
+ }
100
+ }
101
+ #share-btn-container {
102
+ display: flex; margin-top: 1.5rem !important; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
103
+ }
104
+ #share-btn {
105
+ all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
106
+ }
107
+ #share-btn * {
108
+ all: unset;
109
+ }
110
+ """
111
+
112
+ block = gr.Blocks(css=css)
113
+
114
+
115
+
116
+ with block:
117
+ gr.HTML(
118
+ """
119
+ <div style="text-align: center; max-width: 650px; margin: 0 auto;">
120
+ <div
121
+ style="
122
+ display: inline-flex;
123
+ align-items: center;
124
+ gap: 0.8rem;
125
+ font-size: 1.75rem;
126
+ "
127
+ >
128
+ <svg
129
+ width="0.65em"
130
+ height="0.65em"
131
+ viewBox="0 0 115 115"
132
+ fill="none"
133
+ xmlns="http://www.w3.org/2000/svg"
134
+ >
135
+ <rect width="23" height="23" fill="white"></rect>
136
+ <rect y="69" width="23" height="23" fill="white"></rect>
137
+ <rect x="23" width="23" height="23" fill="#AEAEAE"></rect>
138
+ <rect x="23" y="69" width="23" height="23" fill="#AEAEAE"></rect>
139
+ <rect x="46" width="23" height="23" fill="white"></rect>
140
+ <rect x="46" y="69" width="23" height="23" fill="white"></rect>
141
+ <rect x="69" width="23" height="23" fill="black"></rect>
142
+ <rect x="69" y="69" width="23" height="23" fill="black"></rect>
143
+ <rect x="92" width="23" height="23" fill="#D9D9D9"></rect>
144
+ <rect x="92" y="69" width="23" height="23" fill="#AEAEAE"></rect>
145
+ <rect x="115" y="46" width="23" height="23" fill="white"></rect>
146
+ <rect x="115" y="115" width="23" height="23" fill="white"></rect>
147
+ <rect x="115" y="69" width="23" height="23" fill="#D9D9D9"></rect>
148
+ <rect x="92" y="46" width="23" height="23" fill="#AEAEAE"></rect>
149
+ <rect x="92" y="115" width="23" height="23" fill="#AEAEAE"></rect>
150
+ <rect x="92" y="69" width="23" height="23" fill="white"></rect>
151
+ <rect x="69" y="46" width="23" height="23" fill="white"></rect>
152
+ <rect x="69" y="115" width="23" height="23" fill="white"></rect>
153
+ <rect x="69" y="69" width="23" height="23" fill="#D9D9D9"></rect>
154
+ <rect x="46" y="46" width="23" height="23" fill="black"></rect>
155
+ <rect x="46" y="115" width="23" height="23" fill="black"></rect>
156
+ <rect x="46" y="69" width="23" height="23" fill="black"></rect>
157
+ <rect x="23" y="46" width="23" height="23" fill="#D9D9D9"></rect>
158
+ <rect x="23" y="115" width="23" height="23" fill="#AEAEAE"></rect>
159
+ <rect x="23" y="69" width="23" height="23" fill="black"></rect>
160
+ </svg>
161
+ <h1 style="font-weight: 900; margin-bottom: 7px;">
162
+ Whisper
163
+ </h1>
164
+ </div>
165
+ <p style="margin-bottom: 10px; font-size: 94%">
166
+ Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. This demo cuts audio after around 30 secs.
167
+ </p>
168
+ <p>You can skip the queue by using google colab for the space: <a href="https://colab.research.google.com/drive/1WJ98KHgZxFGrHiMm4TyWZllSew_Af_ff?usp=sharing"><img data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" src="https://camo.githubusercontent.com/84f0493939e0c4de4e6dbe113251b4bfb5353e57134ffd9fcab6b8714514d4d1/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667"></a></p>
169
+ </div>
170
+ """
171
+ )
172
+ with gr.Group():
173
+ with gr.Box():
174
+ with gr.Row().style(mobile_collapse=False, equal_height=True):
175
+ audio = gr.Audio(
176
+ label="Input Audio",
177
+ show_label=False,
178
+ source="microphone",
179
+ type="filepath"
180
+ )
181
+
182
+ btn = gr.Button("Transcribe")
183
+ text = gr.Textbox(show_label=False, elem_id="result-textarea")
184
+ with gr.Group(elem_id="share-btn-container"):
185
+ community_icon = gr.HTML(community_icon_html, visible=False)
186
+ loading_icon = gr.HTML(loading_icon_html, visible=False)
187
+ share_button = gr.Button("Share to community", elem_id="share-btn", visible=False)
188
+
189
+
190
+
191
+
192
+ btn.click(inference, inputs=[audio], outputs=[text, community_icon, loading_icon, share_button])
193
+ share_button.click(None, [], [], _js=share_js)
194
+
195
+ gr.HTML('''
196
+ <div class="footer">
197
+ <p>Model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a> - Gradio Demo by 🤗 Hugging Face
198
+ </p>
199
+ </div>
200
+ ''')
201
+
202
+ block.launch()
packages.txt DELETED
@@ -1 +0,0 @@
1
- ffmpeg
 
 
requirements.txt CHANGED
@@ -1,3 +1 @@
1
- git+https://github.com/huggingface/transformers
2
- torch
3
- yt-dlp
 
1
+ transformers