Sangmin committed on
Commit
202fe0b
1 Parent(s): 7ea54f2

Add an option to toggle timestamps

Browse files

If Return Timestamps is true, the app transcribes audio/video files with timestamps in the SRT format.

Files changed (1) hide show
  1. app.py +22 -7
app.py CHANGED
@@ -22,13 +22,25 @@ pipe = pipeline(
22
  device=device,
23
  )
24
 
25
-
26
- def transcribe(inputs, task):
 
 
 
 
 
 
 
 
27
  if inputs is None:
28
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
29
 
30
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
31
- return text
 
 
 
 
32
 
33
 
34
  def _return_yt_html_embed(yt_url):
@@ -95,6 +107,7 @@ mf_transcribe = gr.Interface(
95
  inputs=[
96
  gr.inputs.Audio(source="microphone", type="filepath", optional=True),
97
  gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
 
98
  ],
99
  outputs="text",
100
  layout="horizontal",
@@ -113,13 +126,14 @@ file_transcribe = gr.Interface(
113
  inputs=[
114
  gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
115
  gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
 
116
  ],
117
  outputs="text",
118
  layout="horizontal",
119
  theme="huggingface",
120
- title="Whisper Large V3: Transcribe Audio",
121
  description=(
122
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
123
  f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
124
  " of arbitrary length."
125
  ),
@@ -130,7 +144,8 @@ yt_transcribe = gr.Interface(
130
  fn=yt_transcribe,
131
  inputs=[
132
  gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
133
- gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe")
 
134
  ],
135
  outputs=["html", "text"],
136
  layout="horizontal",
 
22
  device=device,
23
  )
24
 
25
def chunks_to_srt(chunks):
    """Render timestamped transcription chunks as SubRip (SRT) text.

    Args:
        chunks: Iterable of dicts, each with a ``'timestamp'`` entry holding a
            ``(start, end)`` pair in seconds and a ``'text'`` entry.

    Returns:
        A string containing one SRT cue per chunk, numbered from 1, each cue
        terminated by a blank line. An empty string if ``chunks`` is empty.
    """

    def _srt_time(seconds):
        # Convert to whole milliseconds first and round: taking
        # int((seconds % 1) * 1000) truncates float noise, e.g. 2.3 -> ",299".
        total_ms = round(seconds * 1000)
        total_seconds, millis = divmod(total_ms, 1000)
        total_minutes, secs = divmod(total_seconds, 60)
        hours, minutes = divmod(total_minutes, 60)
        return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

    cues = []
    for index, chunk in enumerate(chunks, 1):
        start_time, end_time = chunk['timestamp']
        # NOTE(review): the ASR pipeline can reportedly leave the end timestamp
        # of a chunk as None (e.g. audio cut off mid-word). Fall back to the
        # start time so the cue is still emitted instead of raising TypeError.
        if end_time is None:
            end_time = start_time
        cues.append(
            f"{index}\n{_srt_time(start_time)} --> {_srt_time(end_time)}\n{chunk['text']}\n\n"
        )
    # Single join instead of repeated string concatenation in the loop.
    return "".join(cues)
33
+
34
def transcribe(inputs, task, return_timestamps):
    """Run the speech-recognition pipeline on an audio input.

    Args:
        inputs: Audio input handed straight to ``pipe``; ``None`` means the
            user submitted nothing.
        task: Value forwarded to generation as ``generate_kwargs["task"]``
            (the UI offers "transcribe" and "translate").
        return_timestamps: When truthy, request timestamps from the pipeline
            and return the transcription formatted as SRT.

    Returns:
        SRT-formatted text when ``return_timestamps`` is truthy, otherwise the
        plain transcription text.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # The pipeline output must be bound to `result`: the original assigned it
    # to `text` but then read `result`, raising NameError on every call.
    result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)

    if return_timestamps:
        return chunks_to_srt(result['chunks'])
    return result['text']
44
 
45
 
46
  def _return_yt_html_embed(yt_url):
 
107
  inputs=[
108
  gr.inputs.Audio(source="microphone", type="filepath", optional=True),
109
  gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
110
+ gr.inputs.Checkbox(label="Return timestamps"),
111
  ],
112
  outputs="text",
113
  layout="horizontal",
 
126
  inputs=[
127
  gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
128
  gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
129
+ gr.inputs.Checkbox(label="Return timestamps"),
130
  ],
131
  outputs="text",
132
  layout="horizontal",
133
  theme="huggingface",
134
+ title="Whisper Large V3: Transcribe Audio/Video",
135
  description=(
136
+ "Transcribe long-form microphone or audio inputs with the click of a button! The app uses the"
137
  f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
138
  " of arbitrary length."
139
  ),
 
144
  fn=yt_transcribe,
145
  inputs=[
146
  gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
147
+ gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
148
+ gr.inputs.Checkbox(label="Return timestamps"),
149
  ],
150
  outputs=["html", "text"],
151
  layout="horizontal",