bofenghuang committed
Commit 839f7b3
1 parent: 2263e75

Add new layout

requirements.txt CHANGED
@@ -1,5 +1,7 @@
  git+https://github.com/huggingface/transformers
  git+https://github.com/openai/whisper.git
- torch
+ nltk
+ pandas
+ psutil
  pytube
- psutil
+ torch
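
Note: the new nltk dependency backs the sentence splitting (sent_tokenize) in run_demo_low_api_openai.py, which needs the "punkt" tokenizer data at runtime. A minimal setup sketch, assuming the Space does not already bundle that data (this step is not part of the commit):

    import nltk

    # One-time download of the "punkt" sentence tokenizer; sent_tokenize() raises a LookupError without it.
    nltk.download("punkt")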
run_demo_low_api_openai.py ADDED
@@ -0,0 +1,304 @@
+ #! /usr/bin/env python
+ # coding=utf-8
+ # Copyright 2022 Bofeng Huang
+
+ import datetime
+ import logging
+ import os
+ import re
+ import warnings
+
+ import gradio as gr
+ import pandas as pd
+ import psutil
+ import pytube as pt
+ import torch
+ import whisper
+ from huggingface_hub import hf_hub_download, model_info
+ from nltk.tokenize import sent_tokenize
+ from transformers.utils.logging import disable_progress_bar
+
+ warnings.filterwarnings("ignore")
+ disable_progress_bar()
+
+ DEFAULT_MODEL_NAME = "bofenghuang/whisper-large-v2-cv11-french"
+ CHECKPOINT_FILENAME = "checkpoint_openai.pt"
+
+ GEN_KWARGS = {
+     "task": "transcribe",
+     "language": "fr",
+     # "without_timestamps": True,
+     # decode options
+     # "beam_size": 5,
+     # "patience": 2,
+     # disable fallback
+     # "compression_ratio_threshold": None,
+     # "logprob_threshold": None,
+     # vad threshold
+     # "no_speech_threshold": None,
+ }
+
+ logging.basicConfig(
+     format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
+     datefmt="%Y-%m-%dT%H:%M:%SZ",
+ )
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)
+
+ # device = 0 if torch.cuda.is_available() else "cpu"
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ logger.info(f"Model will be loaded on device `{device}`")
+
+ cached_models = {}
+
+
+ def format_timestamp(seconds):
+     return str(datetime.timedelta(seconds=round(seconds)))
+
+
+ def _return_yt_html_embed(yt_url):
+     video_id = yt_url.split("?v=")[-1]
+     HTML_str = (
+         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>' " </center>"
+     )
+     return HTML_str
+
+
+ def download_audio_from_youtube(yt_url, downloaded_filename="audio.wav"):
+     yt = pt.YouTube(yt_url)
+     stream = yt.streams.filter(only_audio=True)[0]
+     # stream.download(filename="audio.mp3")
+     stream.download(filename=downloaded_filename)
+     return downloaded_filename
+
+
+ def download_video_from_youtube(yt_url, downloaded_filename="video.mp4"):
+     yt = pt.YouTube(yt_url)
+     stream = yt.streams.filter(progressive=True, file_extension="mp4").order_by("resolution").desc().first()
+     stream.download(filename=downloaded_filename)
+     logger.info(f"Downloaded YouTube video from {yt_url}")
+     return downloaded_filename
+
+
+ def _print_memory_info():
+     memory = psutil.virtual_memory()
+     logger.info(
+         f"Memory info - Free: {memory.available / (1024 ** 3):.2f} GB, used: {memory.percent}%, total: {memory.total / (1024 ** 3):.2f} GB"
+     )
+
+
+ def _print_cuda_memory_info():
+     # torch.cuda.mem_get_info() returns (free, total) in bytes
+     free_mem, tot_mem = torch.cuda.mem_get_info()
+     logger.info(
+         f"CUDA memory info - Free: {free_mem / 1024 ** 3:.2f} GB, used: {(tot_mem - free_mem) / 1024 ** 3:.2f} GB, total: {tot_mem / 1024 ** 3:.2f} GB"
+     )
+
+
+ def print_memory_info():
+     _print_memory_info()
+     _print_cuda_memory_info()
+
+
+ def maybe_load_cached_pipeline(model_name):
+     model = cached_models.get(model_name)
+     if model is None:
+         downloaded_model_path = hf_hub_download(repo_id=model_name, filename=CHECKPOINT_FILENAME)
+
+         model = whisper.load_model(downloaded_model_path, device=device)
+         logger.info(f"`{model_name}` has been loaded on device `{device}`")
+
+         print_memory_info()
+
+         cached_models[model_name] = model
+     return model
+
+
+ def infer(model, filename, with_timestamps, return_df=False):
+     if with_timestamps:
+         model_outputs = model.transcribe(filename, **GEN_KWARGS)
+         if return_df:
+             model_outputs_df = pd.DataFrame(model_outputs["segments"])
+             # print(model_outputs)
+             # print(model_outputs_df)
+             # print(model_outputs_df.info(verbose=True))
+             model_outputs_df = model_outputs_df[["start", "end", "text"]]
+             model_outputs_df["start"] = model_outputs_df["start"].map(format_timestamp)
+             model_outputs_df["end"] = model_outputs_df["end"].map(format_timestamp)
+             model_outputs_df["text"] = model_outputs_df["text"].str.strip()
+             return model_outputs_df
+         else:
+             return "\n\n".join(
+                 [
+                     f'Segment {segment["id"]+1} from {segment["start"]:.2f}s to {segment["end"]:.2f}s:\n{segment["text"].strip()}'
+                     for segment in model_outputs["segments"]
+                 ]
+             )
+     else:
+         text = model.transcribe(filename, without_timestamps=True, **GEN_KWARGS)["text"]
+         if return_df:
+             return pd.DataFrame({"text": sent_tokenize(text)})
+         else:
+             return text
+
+
+ def transcribe(microphone, file_upload, with_timestamps, model_name=DEFAULT_MODEL_NAME):
+     warn_output = ""
+     if (microphone is not None) and (file_upload is not None):
+         warn_output = (
+             "WARNING: You've uploaded an audio file and used the microphone. "
+             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+         )
+
+     elif (microphone is None) and (file_upload is None):
+         return "ERROR: You have to either use the microphone or upload an audio file"
+
+     file = microphone if microphone is not None else file_upload
+
+     model = maybe_load_cached_pipeline(model_name)
+     # text = model.transcribe(file, **GEN_KWARGS)["text"]
+     # text = infer(model, file, with_timestamps)
+     text = infer(model, file, with_timestamps, return_df=True)
+
+     logger.info(f'Transcription by `{model_name}`:\n{text.to_json(orient="index", force_ascii=False, indent=2)}\n')
+
+     # return warn_output + text
+     return text
+
+
+ def yt_transcribe(yt_url, with_timestamps, model_name=DEFAULT_MODEL_NAME):
+     # html_embed_str = _return_yt_html_embed(yt_url)
+     audio_file_path = download_audio_from_youtube(yt_url)
+
+     model = maybe_load_cached_pipeline(model_name)
+     # text = model.transcribe("audio.mp3", **GEN_KWARGS)["text"]
+     # text = infer(model, audio_file_path, with_timestamps)
+     text = infer(model, audio_file_path, with_timestamps, return_df=True)
+
+     logger.info(f'Transcription by `{model_name}` of "{yt_url}":\n{text.to_json(orient="index", force_ascii=False, indent=2)}\n')
+
+     # return html_embed_str, text
+     return text
+
+
+ def video_transcribe(video_file_path, model_name=DEFAULT_MODEL_NAME):
+     if video_file_path is None:
+         raise ValueError("Failed to transcribe video as no video_file_path has been defined")
+
+     audio_file_path = re.sub(r"\.mp4$", ".wav", video_file_path)
+     os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file_path}"')
+
+     model = maybe_load_cached_pipeline(model_name)
+     # text = model.transcribe("audio.mp3", **GEN_KWARGS)["text"]
+     text = infer(model, audio_file_path, with_timestamps=True, return_df=True)
+
+     logger.info(f'Transcription by `{model_name}`:\n{text.to_json(orient="index", force_ascii=False, indent=2)}\n')
+
+     return text
+
+
+ # load default model
+ maybe_load_cached_pipeline(DEFAULT_MODEL_NAME)
+
+ # default_text_output_df = pd.DataFrame(columns=["start", "end", "text"])
+ default_text_output_df = pd.DataFrame(columns=["text"])
+
+ with gr.Blocks() as demo:
+
+     with gr.Tab("Transcribe Audio"):
+         gr.Markdown(
+             f"""
+             <div>
+             <h1 style='text-align: center'>Whisper French Demo 🇫🇷 : Transcribe Audio</h1>
+             </div>
+             Transcribe long-form microphone or audio inputs!
+
+             Demo uses the fine-tuned checkpoint: <a href='https://huggingface.co/{DEFAULT_MODEL_NAME}' target='_blank'><b>{DEFAULT_MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+             """
+         )
+
+         microphone_input = gr.inputs.Audio(source="microphone", type="filepath", label="Record", optional=True)
+         upload_input = gr.inputs.Audio(source="upload", type="filepath", label="Upload File", optional=True)
+         with_timestamps_input = gr.Checkbox(label="With timestamps?")
+
+         microphone_transcribe_btn = gr.Button("Transcribe Audio")
+
+         # gr.Markdown('''
+         # Here you will get the generated transcript.
+         # ''')
+
+         # microphone_text_output = gr.outputs.Textbox(label="Transcription")
+         text_output_df2 = gr.DataFrame(
+             value=default_text_output_df,
+             label="Transcription",
+             row_count=(0, "dynamic"),
+             max_rows=10,
+             wrap=True,
+             overflow_row_behaviour="paginate",
+         )
+
+         microphone_transcribe_btn.click(
+             transcribe, inputs=[microphone_input, upload_input, with_timestamps_input], outputs=text_output_df2
+         )
+
+     # with gr.Tab("Transcribe YouTube"):
+     #     gr.Markdown(
+     #         f"""
+     #         <div>
+     #         <h1 style='text-align: center'>Whisper French Demo 🇫🇷 : Transcribe YouTube</h1>
+     #         </div>
+     #         Transcribe long-form YouTube videos!
+
+     #         Demo uses the fine-tuned checkpoint: <a href='https://huggingface.co/{DEFAULT_MODEL_NAME}' target='_blank'><b>{DEFAULT_MODEL_NAME}</b></a> to transcribe video files of arbitrary length.
+     #         """
+     #     )
+
+     #     yt_link_input2 = gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
+     #     with_timestamps_input2 = gr.Checkbox(label="With timestamps?", value=True)
+
+     #     yt_transcribe_btn = gr.Button("Transcribe YouTube")
+
+     #     # yt_text_output = gr.outputs.Textbox(label="Transcription")
+     #     text_output_df3 = gr.DataFrame(
+     #         value=default_text_output_df,
+     #         label="Transcription",
+     #         row_count=(0, "dynamic"),
+     #         max_rows=10,
+     #         wrap=True,
+     #         overflow_row_behaviour="paginate",
+     #     )
+     #     # yt_html_output = gr.outputs.HTML(label="YouTube Page")
+
+     #     yt_transcribe_btn.click(yt_transcribe, inputs=[yt_link_input2, with_timestamps_input2], outputs=[text_output_df3])
+
+     with gr.Tab("Transcribe Video"):
+         gr.Markdown(
+             f"""
+             <div>
+             <h1 style='text-align: center'>Whisper French Demo 🇫🇷 : Transcribe Video</h1>
+             </div>
+             Transcribe long-form YouTube videos or uploaded video inputs!
+
+             Demo uses the fine-tuned checkpoint: <a href='https://huggingface.co/{DEFAULT_MODEL_NAME}' target='_blank'><b>{DEFAULT_MODEL_NAME}</b></a> to transcribe video files of arbitrary length.
+             """
+         )
+
+         yt_link_input = gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
+         download_youtube_btn = gr.Button("Download YouTube video")
+         downloaded_video_output = gr.Video(label="Video file", mirror_webcam=False)
+         download_youtube_btn.click(download_video_from_youtube, inputs=[yt_link_input], outputs=[downloaded_video_output])
+
+         video_transcribe_btn = gr.Button("Transcribe video")
+         text_output_df = gr.DataFrame(
+             value=default_text_output_df,
+             label="Transcription",
+             row_count=(0, "dynamic"),
+             max_rows=10,
+             wrap=True,
+             overflow_row_behaviour="paginate",
+         )
+
+         video_transcribe_btn.click(video_transcribe, inputs=[downloaded_video_output], outputs=[text_output_df])
+
+ # demo.launch(server_name="0.0.0.0", debug=True)
+ # demo.launch(server_name="0.0.0.0", debug=True, share=True)
+ demo.launch(enable_queue=True)
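
Note: video_transcribe shells out to ffmpeg through os.system with an f-string, which is fragile for paths containing quotes. A sketch of the same 16 kHz mono PCM extraction using subprocess (an alternative shown for illustration, not what this commit ships):

    import subprocess

    def extract_audio(video_file_path: str, audio_file_path: str) -> None:
        # Same flags as the commit: 16 kHz sample rate, mono, 16-bit PCM WAV.
        subprocess.run(
            ["ffmpeg", "-i", video_file_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", audio_file_path],
            check=True,
        )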
run_demo_openai.py CHANGED
@@ -46,7 +46,7 @@ cached_models = {}
  def _print_memory_info():
      memory = psutil.virtual_memory()
      logger.info(
-         f"Memory: {memory.total / (1024 ** 3):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 ** 3):.2f}GB"
+         f"Memory info - Free: {memory.available / (1024 ** 3):.2f} GB, used: {memory.percent}%, total: {memory.total / (1024 ** 3):.2f} GB"
      )


@@ -89,102 +89,85 @@ def infer(model, filename, with_timestamps):
          return model.transcribe(filename, without_timestamps=True, **GEN_KWARGS)["text"]


- def transcribe(microphone, file_upload, with_timestamps, model_name=DEFAULT_MODEL_NAME):
+ def download_from_youtube(yt_url, downloaded_filename="audio.wav"):
+     yt = pt.YouTube(yt_url)
+     stream = yt.streams.filter(only_audio=True)[0]
+     # stream.download(filename="audio.mp3")
+     stream.download(filename=downloaded_filename)
+     return downloaded_filename
+
+
+ def transcribe(microphone, file_upload, yt_url, with_timestamps, model_name=DEFAULT_MODEL_NAME):
      warn_output = ""
+     if (microphone is not None) and (file_upload is not None) and yt_url:
+         warn_output = (
+             "WARNING: You've uploaded an audio file, used the microphone, and pasted a YouTube URL. "
+             "The recorded file from the microphone will be used, the uploaded audio and the YouTube URL will be discarded.\n"
+         )
+
      if (microphone is not None) and (file_upload is not None):
          warn_output = (
              "WARNING: You've uploaded an audio file and used the microphone. "
              "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
          )

-     elif (microphone is None) and (file_upload is None):
-         return "ERROR: You have to either use the microphone or upload an audio file"
-
-     file = microphone if microphone is not None else file_upload
-     try:
-         model = maybe_load_cached_pipeline(model_name)
-         # text = model.transcribe(file, **GEN_KWARGS)["text"]
-         text = infer(model, file, with_timestamps)
-
-         logger.info(f"Transcription by `{model_name}`:\n{text}\n")
-     except Exception as e:
-         logger.info(str(e))
-
-     return warn_output + text
-
-
- def _return_yt_html_embed(yt_url):
-     video_id = yt_url.split("?v=")[-1]
-     HTML_str = (
-         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>' " </center>"
-     )
-     return HTML_str
-
-
- def yt_transcribe(yt_url, with_timestamps, model_name=DEFAULT_MODEL_NAME):
-     yt = pt.YouTube(yt_url)
-     html_embed_str = _return_yt_html_embed(yt_url)
-     stream = yt.streams.filter(only_audio=True)[0]
-     stream.download(filename="audio.mp3")
+     if (microphone is not None) and yt_url:
+         warn_output = (
+             "WARNING: You've used the microphone and pasted a YouTube URL. "
+             "The recorded file from the microphone will be used and the YouTube URL will be discarded.\n"
+         )
+
+     if (file_upload is not None) and yt_url:
+         warn_output = (
+             "WARNING: You've uploaded an audio file and pasted a YouTube URL. "
+             "The uploaded audio will be used and the YouTube URL will be discarded.\n"
+         )
+
+     elif (microphone is None) and (file_upload is None) and (not yt_url):
+         return "ERROR: You have to either use the microphone, upload an audio file or paste a YouTube URL"
+
+     if microphone is not None:
+         file = microphone
+         logging_prefix = f"Transcription by `{model_name}` of microphone:"
+     elif file_upload is not None:
+         file = file_upload
+         logging_prefix = f"Transcription by `{model_name}` of uploaded file:"
+     else:
+         file = download_from_youtube(yt_url)
+         logging_prefix = f'Transcription by `{model_name}` of "{yt_url}":'

      model = maybe_load_cached_pipeline(model_name)
-     # text = model.transcribe("audio.mp3", **GEN_KWARGS)["text"]
-     text = infer(model, "audio.mp3", with_timestamps)
+     # text = model.transcribe(file, **GEN_KWARGS)["text"]
+     text = infer(model, file, with_timestamps)

-     logger.info(f'Transcription by `{model_name}` of "{yt_url}":\n{text}\n')
+     logger.info(logging_prefix + "\n" + text + "\n")

-     return html_embed_str, text
+     return warn_output + text


  # load default model
  maybe_load_cached_pipeline(DEFAULT_MODEL_NAME)

- demo = gr.Blocks()
-
- mf_transcribe = gr.Interface(
+ demo = gr.Interface(
      fn=transcribe,
      inputs=[
          gr.inputs.Audio(source="microphone", type="filepath", label="Record", optional=True),
          gr.inputs.Audio(source="upload", type="filepath", label="Upload File", optional=True),
-         gr.Checkbox(label="With timestamps?", value=True),
+         gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL", optional=True),
+         gr.Checkbox(label="With timestamps?"),
      ],
-     # outputs="text",
      outputs=gr.outputs.Textbox(label="Transcription"),
      layout="horizontal",
      theme="huggingface",
-     title="Whisper French Demo 🇫🇷 : Transcribe Audio",
+     title="Whisper French Demo 🇫🇷",
      description=(
-         "Transcribe long-form microphone or audio inputs with the click of a button!\n\nDemo uses the the fine-tuned"
+         "**Transcribe long-form microphone, audio inputs or YouTube videos with the click of a button!** \n\nDemo uses the fine-tuned"
          f" checkpoint [{DEFAULT_MODEL_NAME}](https://huggingface.co/{DEFAULT_MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
          " of arbitrary length."
      ),
      allow_flagging="never",
  )

- yt_transcribe = gr.Interface(
-     fn=yt_transcribe,
-     inputs=[
-         gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-         gr.Checkbox(label="With timestamps?", value=True),
-     ],
-     # outputs=["html", "text"],
-     outputs=[
-         gr.outputs.HTML(label="YouTube Page"),
-         gr.outputs.Textbox(label="Transcription"),
-     ],
-     layout="horizontal",
-     theme="huggingface",
-     title="Whisper French Demo 🇫🇷 : Transcribe YouTube",
-     description=(
-         "Transcribe long-form YouTube videos with the click of a button!\n\nDemo uses the the fine-tuned checkpoint:"
-         f" [{DEFAULT_MODEL_NAME}](https://huggingface.co/{DEFAULT_MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
-         " arbitrary length."
-     ),
-     allow_flagging="never",
- )
-
- with demo:
-     gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

  # demo.launch(server_name="0.0.0.0", debug=True, share=True)
  demo.launch(enable_queue=True)
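
Note: the merged transcribe() resolves competing inputs by precedence: the microphone recording beats an uploaded file, which beats a YouTube URL. A hypothetical helper expressing the same rule in one place (pick_input is illustrative and not part of the commit):

    def pick_input(microphone, file_upload, yt_url):
        # Mirror the warning logic above: the highest-priority source wins.
        if microphone is not None:
            return microphone
        if file_upload is not None:
            return file_upload
        if yt_url:
            return download_from_youtube(yt_url)
        raise ValueError("Provide a microphone recording, an uploaded file, or a YouTube URL")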
run_demo_openai_merged.py DELETED
@@ -1,174 +0,0 @@
- import logging
- import warnings
-
- import gradio as gr
- import pytube as pt
- import psutil
- import torch
- import whisper
- from huggingface_hub import hf_hub_download, model_info
- from transformers.utils.logging import disable_progress_bar
-
- warnings.filterwarnings("ignore")
- disable_progress_bar()
-
- DEFAULT_MODEL_NAME = "bofenghuang/whisper-large-v2-cv11-french"
- CHECKPOINT_FILENAME = "checkpoint_openai.pt"
-
- GEN_KWARGS = {
-     "task": "transcribe",
-     "language": "fr",
-     # "without_timestamps": True,
-     # decode options
-     # "beam_size": 5,
-     # "patience": 2,
-     # disable fallback
-     # "compression_ratio_threshold": None,
-     # "logprob_threshold": None,
-     # vad threshold
-     # "no_speech_threshold": None,
- }
-
- logging.basicConfig(
-     format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
-     datefmt="%Y-%m-%dT%H:%M:%SZ",
- )
- logger = logging.getLogger(__name__)
- logger.setLevel(logging.DEBUG)
-
- # device = 0 if torch.cuda.is_available() else "cpu"
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
- logger.info(f"Model will be loaded on device `{device}`")
-
- cached_models = {}
-
-
- def _print_memory_info():
-     memory = psutil.virtual_memory()
-     logger.info(
-         f"Memory info - Free: {memory.available / (1024 ** 3):.2f} Gb, used: {memory.percent}%, total: {memory.total / (1024 ** 3):.2f} Gb"
-     )
-
-
- def print_cuda_memory_info():
-     used_mem, tot_mem = torch.cuda.mem_get_info()
-     logger.info(
-         f"CUDA memory info - Free: {used_mem / 1024 ** 3:.2f} Gb, used: {(tot_mem - used_mem) / 1024 ** 3:.2f} Gb, total: {tot_mem / 1024 ** 3:.2f} Gb"
-     )
-
-
- def print_memory_info():
-     _print_memory_info()
-     print_cuda_memory_info()
-
-
- def maybe_load_cached_pipeline(model_name):
-     model = cached_models.get(model_name)
-     if model is None:
-         downloaded_model_path = hf_hub_download(repo_id=model_name, filename=CHECKPOINT_FILENAME)
-
-         model = whisper.load_model(downloaded_model_path, device=device)
-         logger.info(f"`{model_name}` has been loaded on device `{device}`")
-
-         print_memory_info()
-
-         cached_models[model_name] = model
-     return model
-
-
- def infer(model, filename, with_timestamps):
-     if with_timestamps:
-         model_outputs = model.transcribe(filename, **GEN_KWARGS)
-         return "\n\n".join(
-             [
-                 f'Segment {segment["id"]+1} from {segment["start"]:.2f}s to {segment["end"]:.2f}s:\n{segment["text"].strip()}'
-                 for segment in model_outputs["segments"]
-             ]
-         )
-     else:
-         return model.transcribe(filename, without_timestamps=True, **GEN_KWARGS)["text"]
-
-
- def download_from_youtube(yt_url, downloaded_filename="audio.wav"):
-     yt = pt.YouTube(yt_url)
-     stream = yt.streams.filter(only_audio=True)[0]
-     # stream.download(filename="audio.mp3")
-     stream.download(filename=downloaded_filename)
-     return downloaded_filename
-
-
- def transcribe(microphone, file_upload, yt_url, with_timestamps, model_name=DEFAULT_MODEL_NAME):
-     warn_output = ""
-     if (microphone is not None) and (file_upload is not None) and yt_url:
-         warn_output = (
-             "WARNING: You've uploaded an audio file, used the microphone, and pasted a YouTube URL. "
-             "The recorded file from the microphone will be used, the uploaded audio and the YouTube URL will be discarded.\n"
-         )
-
-     if (microphone is not None) and (file_upload is not None):
-         warn_output = (
-             "WARNING: You've uploaded an audio file and used the microphone. "
-             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
-         )
-
-     if (microphone is not None) and yt_url:
-         warn_output = (
-             "WARNING: You've used the microphone and pasted a YouTube URL. "
-             "The recorded file from the microphone will be used and the YouTube URL will be discarded.\n"
-         )
-
-     if (file_upload is not None) and yt_url:
-         warn_output = (
-             "WARNING: You've uploaded an audio file and pasted a YouTube URL. "
-             "The uploaded audio will be used and the YouTube URL will be discarded.\n"
-         )
-
-     elif (microphone is None) and (file_upload is None) and (not yt_url):
-         return "ERROR: You have to either use the microphone, upload an audio file or paste a YouTube URL"
-
-     if microphone is not None:
-         file = microphone
-         logging_prefix = f"Transcription by `{model_name}` of microphone:"
-     elif file_upload is not None:
-         file = file_upload
-         logging_prefix = f"Transcription by `{model_name}` of uploaded file:"
-     else:
-         file = download_from_youtube(yt_url)
-         logging_prefix = f'Transcription by `{model_name}` of "{yt_url}":'
-
-     model = maybe_load_cached_pipeline(model_name)
-     # text = model.transcribe(file, **GEN_KWARGS)["text"]
-     text = infer(model, file, with_timestamps)
-
-     logger.info(logging_prefix + "\n" + text + "\n")
-
-     return warn_output + text
-
-
- # load default model
- maybe_load_cached_pipeline(DEFAULT_MODEL_NAME)
-
- demo = gr.Interface(
-     fn=transcribe,
-     inputs=[
-         gr.inputs.Audio(source="microphone", type="filepath", label="Record", optional=True),
-         gr.inputs.Audio(source="upload", type="filepath", label="Upload File", optional=True),
-         gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL", optional=True),
-         gr.Checkbox(label="With timestamps?"),
-     ],
-     # outputs="text",
-     outputs=gr.outputs.Textbox(label="Transcription"),
-     layout="horizontal",
-     theme="huggingface",
-     title="Whisper French Demo 🇫🇷 : Transcribe Audio",
-     description=(
-         "**Transcribe long-form microphone, audio inputs or YouTube videos with the click of a button!** \n\nDemo uses the the fine-tuned"
-         f" checkpoint [{DEFAULT_MODEL_NAME}](https://huggingface.co/{DEFAULT_MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-         " of arbitrary length."
-     ),
-     allow_flagging="never",
- )
-
-
- # demo.launch(server_name="0.0.0.0", debug=True, share=True)
- demo.launch(enable_queue=True)
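
Note: both remaining scripts keep loaded checkpoints in the module-level cached_models dict, so a model is downloaded from the Hub and loaded onto the device at most once per process. A minimal usage sketch (illustrative only):

    # The first call downloads CHECKPOINT_FILENAME and loads it; later calls hit the cache.
    model_a = maybe_load_cached_pipeline(DEFAULT_MODEL_NAME)
    model_b = maybe_load_cached_pipeline(DEFAULT_MODEL_NAME)
    assert model_a is model_b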