danielwm994 committed on
Commit
a9ecf96
·
verified ·
1 Parent(s): f1059c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -30
app.py CHANGED
@@ -1,9 +1,11 @@
1
  import spaces
2
  import torch
 
3
  import gradio as gr
4
  import yt_dlp as youtube_dl
5
  from transformers import pipeline
6
  from transformers.pipelines.audio_utils import ffmpeg_read
 
7
  import tempfile
8
  import os
9
 
@@ -21,27 +23,20 @@ pipe = pipeline(
21
  device=device,
22
  )
23
 
 
24
  @spaces.GPU
25
  def transcribe(inputs, task):
26
  if inputs is None:
27
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
28
-
29
- # Perform transcription and get result with word-level timestamps
30
- result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps="word")
31
-
32
  text = result["text"]
33
- timestamps = result.get("chunks", [])
34
-
35
- word_timestamps = []
36
- for chunk in timestamps:
37
- # Ensure the "words" key is present in each chunk
38
- if "words" in chunk:
39
- for word_info in chunk["words"]:
40
- word_timestamps.append(f"{word_info['word']} [{word_info['start']:.2f}-{word_info['end']:.2f}]")
41
- else:
42
- word_timestamps.append("No word-level timestamps available for this chunk.")
43
 
44
- return "\n".join(word_timestamps)
 
45
 
46
  def _return_yt_html_embed(yt_url):
47
  video_id = yt_url.split("?v=")[-1]
@@ -95,20 +90,13 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
95
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
96
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
97
 
98
- result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps="word")
99
-
100
  text = result["text"]
101
- timestamps = result.get("chunks", [])
102
-
103
- word_timestamps = []
104
- for chunk in timestamps:
105
- if "words" in chunk:
106
- for word_info in chunk["words"]:
107
- word_timestamps.append(f"{word_info['word']} [{word_info['start']:.2f}-{word_info['end']:.2f}]")
108
- else:
109
- word_timestamps.append("No word-level timestamps available for this chunk.")
110
 
111
- return html_embed_str, "\n".join(word_timestamps)
112
 
113
 
114
  demo = gr.Blocks()
@@ -119,7 +107,7 @@ mf_transcribe = gr.Interface(
119
  gr.Audio(sources="microphone", type="filepath"),
120
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
121
  ],
122
- outputs="text",
123
  title="Whisper Large V3: Transcribe Audio",
124
  description=(
125
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
@@ -135,7 +123,7 @@ file_transcribe = gr.Interface(
135
  gr.Audio(sources="upload", type="filepath", label="Audio file"),
136
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
137
  ],
138
- outputs="text",
139
  title="Whisper Large V3: Transcribe Audio",
140
  description=(
141
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
@@ -151,7 +139,7 @@ yt_transcribe = gr.Interface(
151
  gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
152
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
153
  ],
154
- outputs=["html", "text"],
155
  title="Whisper Large V3: Transcribe YouTube",
156
  description=(
157
  "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
 
1
  import spaces
2
  import torch
3
+
4
  import gradio as gr
5
  import yt_dlp as youtube_dl
6
  from transformers import pipeline
7
  from transformers.pipelines.audio_utils import ffmpeg_read
8
+
9
  import tempfile
10
  import os
11
 
 
23
  device=device,
24
  )
25
 
26
+
27
  @spaces.GPU
28
  def transcribe(inputs, task):
29
  if inputs is None:
30
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
31
+
32
+ result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
 
33
  text = result["text"]
34
+ timestamps = result["chunks"]
35
+
36
+ timestamp_str = "\n".join([f"[{chunk['timestamp']}] {chunk['text']}" for chunk in timestamps])
 
 
 
 
 
 
 
37
 
38
+ return text, timestamp_str
39
+
40
 
41
  def _return_yt_html_embed(yt_url):
42
  video_id = yt_url.split("?v=")[-1]
 
90
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
91
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
92
 
93
+ result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
94
  text = result["text"]
95
+ timestamps = result["chunks"]
96
+
97
+ timestamp_str = "\n".join([f"[{chunk['timestamp']}] {chunk['text']}" for chunk in timestamps])
 
 
 
 
 
 
98
 
99
+ return html_embed_str, text, timestamp_str
100
 
101
 
102
  demo = gr.Blocks()
 
107
  gr.Audio(sources="microphone", type="filepath"),
108
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
109
  ],
110
+ outputs=["text", "text"], # Output both text and timestamps
111
  title="Whisper Large V3: Transcribe Audio",
112
  description=(
113
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
 
123
  gr.Audio(sources="upload", type="filepath", label="Audio file"),
124
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
125
  ],
126
+ outputs=["text", "text"], # Output both text and timestamps
127
  title="Whisper Large V3: Transcribe Audio",
128
  description=(
129
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
 
139
  gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
140
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
141
  ],
142
+ outputs=["html", "text", "text"], # Output both text and timestamps
143
  title="Whisper Large V3: Transcribe YouTube",
144
  description=(
145
  "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"