Spaces:
Runtime error
Runtime error
thealphamerc
committed on
Commit
·
fdad218
1
Parent(s):
19befe8
Added youtube video to text support
Browse files- .gitignore +1 -0
- Output/audio.txt +0 -1
- Output/audio2.txt +0 -1
- Output/audio3.json +0 -173
- Output/audio3.txt +0 -1
- app.py +43 -4
- requirements.txt +1 -0
.gitignore
CHANGED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
output/
|
Output/audio.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
Quatlin, quatlin quatlin quatlin quatlin. Anti-six.
|
|
|
|
Output/audio2.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
to gain life in all that...
|
|
|
|
Output/audio3.json
DELETED
@@ -1,173 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"id": 0,
|
4 |
-
"seek": 0,
|
5 |
-
"start": 0.0,
|
6 |
-
"end": 1.52,
|
7 |
-
"text": " Come and sit on a rock.",
|
8 |
-
"tokens": [
|
9 |
-
50363,
|
10 |
-
7911,
|
11 |
-
290,
|
12 |
-
1650,
|
13 |
-
319,
|
14 |
-
257,
|
15 |
-
3881,
|
16 |
-
13,
|
17 |
-
50439
|
18 |
-
],
|
19 |
-
"temperature": 0.0,
|
20 |
-
"avg_logprob": -0.34572365704704733,
|
21 |
-
"compression_ratio": 1.356164383561644,
|
22 |
-
"no_speech_prob": 0.01958448439836502
|
23 |
-
},
|
24 |
-
{
|
25 |
-
"id": 1,
|
26 |
-
"seek": 0,
|
27 |
-
"start": 1.52,
|
28 |
-
"end": 5.08,
|
29 |
-
"text": " Overlooking the river's blow, he wears a hat and some glasses.",
|
30 |
-
"tokens": [
|
31 |
-
50439,
|
32 |
-
3827,
|
33 |
-
11534,
|
34 |
-
262,
|
35 |
-
7850,
|
36 |
-
338,
|
37 |
-
6611,
|
38 |
-
11,
|
39 |
-
339,
|
40 |
-
17326,
|
41 |
-
257,
|
42 |
-
6877,
|
43 |
-
290,
|
44 |
-
617,
|
45 |
-
15232,
|
46 |
-
13,
|
47 |
-
50617
|
48 |
-
],
|
49 |
-
"temperature": 0.0,
|
50 |
-
"avg_logprob": -0.34572365704704733,
|
51 |
-
"compression_ratio": 1.356164383561644,
|
52 |
-
"no_speech_prob": 0.01958448439836502
|
53 |
-
},
|
54 |
-
{
|
55 |
-
"id": 2,
|
56 |
-
"seek": 0,
|
57 |
-
"start": 5.08,
|
58 |
-
"end": 7.36,
|
59 |
-
"text": " A smile on his face.",
|
60 |
-
"tokens": [
|
61 |
-
50617,
|
62 |
-
317,
|
63 |
-
8212,
|
64 |
-
319,
|
65 |
-
465,
|
66 |
-
1986,
|
67 |
-
13,
|
68 |
-
50731
|
69 |
-
],
|
70 |
-
"temperature": 0.0,
|
71 |
-
"avg_logprob": -0.34572365704704733,
|
72 |
-
"compression_ratio": 1.356164383561644,
|
73 |
-
"no_speech_prob": 0.01958448439836502
|
74 |
-
},
|
75 |
-
{
|
76 |
-
"id": 3,
|
77 |
-
"seek": 0,
|
78 |
-
"start": 7.36,
|
79 |
-
"end": 8.56,
|
80 |
-
"text": " He's not lost.",
|
81 |
-
"tokens": [
|
82 |
-
50731,
|
83 |
-
679,
|
84 |
-
338,
|
85 |
-
407,
|
86 |
-
2626,
|
87 |
-
13,
|
88 |
-
50791
|
89 |
-
],
|
90 |
-
"temperature": 0.0,
|
91 |
-
"avg_logprob": -0.34572365704704733,
|
92 |
-
"compression_ratio": 1.356164383561644,
|
93 |
-
"no_speech_prob": 0.01958448439836502
|
94 |
-
},
|
95 |
-
{
|
96 |
-
"id": 4,
|
97 |
-
"seek": 0,
|
98 |
-
"start": 8.56,
|
99 |
-
"end": 10.4,
|
100 |
-
"text": " The water rushes by.",
|
101 |
-
"tokens": [
|
102 |
-
50791,
|
103 |
-
383,
|
104 |
-
1660,
|
105 |
-
38596,
|
106 |
-
416,
|
107 |
-
13,
|
108 |
-
50883
|
109 |
-
],
|
110 |
-
"temperature": 0.0,
|
111 |
-
"avg_logprob": -0.34572365704704733,
|
112 |
-
"compression_ratio": 1.356164383561644,
|
113 |
-
"no_speech_prob": 0.01958448439836502
|
114 |
-
},
|
115 |
-
{
|
116 |
-
"id": 5,
|
117 |
-
"seek": 0,
|
118 |
-
"start": 10.4,
|
119 |
-
"end": 12.08,
|
120 |
-
"text": " A constant sound.",
|
121 |
-
"tokens": [
|
122 |
-
50883,
|
123 |
-
317,
|
124 |
-
6937,
|
125 |
-
2128,
|
126 |
-
13,
|
127 |
-
50967
|
128 |
-
],
|
129 |
-
"temperature": 0.0,
|
130 |
-
"avg_logprob": -0.34572365704704733,
|
131 |
-
"compression_ratio": 1.356164383561644,
|
132 |
-
"no_speech_prob": 0.01958448439836502
|
133 |
-
},
|
134 |
-
{
|
135 |
-
"id": 6,
|
136 |
-
"seek": 0,
|
137 |
-
"start": 12.08,
|
138 |
-
"end": 13.68,
|
139 |
-
"text": " It takes in the view.",
|
140 |
-
"tokens": [
|
141 |
-
50967,
|
142 |
-
632,
|
143 |
-
2753,
|
144 |
-
287,
|
145 |
-
262,
|
146 |
-
1570,
|
147 |
-
13,
|
148 |
-
51047
|
149 |
-
],
|
150 |
-
"temperature": 0.0,
|
151 |
-
"avg_logprob": -0.34572365704704733,
|
152 |
-
"compression_ratio": 1.356164383561644,
|
153 |
-
"no_speech_prob": 0.01958448439836502
|
154 |
-
},
|
155 |
-
{
|
156 |
-
"id": 7,
|
157 |
-
"seek": 0,
|
158 |
-
"start": 13.68,
|
159 |
-
"end": 14.48,
|
160 |
-
"text": " The mountains.",
|
161 |
-
"tokens": [
|
162 |
-
51047,
|
163 |
-
383,
|
164 |
-
12269,
|
165 |
-
13,
|
166 |
-
51087
|
167 |
-
],
|
168 |
-
"temperature": 0.0,
|
169 |
-
"avg_logprob": -0.34572365704704733,
|
170 |
-
"compression_ratio": 1.356164383561644,
|
171 |
-
"no_speech_prob": 0.01958448439836502
|
172 |
-
}
|
173 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Output/audio3.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
Come and sit on a rock. Overlooking the river's blow, he wears a hat and some glasses. A smile on his face. He's not lost. The water rushes by. A constant sound. It takes in the view. The mountains.
|
|
|
|
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import os
|
2 |
os.system("pip install git+https://github.com/openai/whisper.git")
|
|
|
3 |
import gradio as gr
|
4 |
from subprocess import call
|
5 |
import whisper
|
@@ -43,7 +44,7 @@ model = whisper.load_model("base")
|
|
43 |
|
44 |
inputs = gr.components.Audio(type="filepath", label="Add audio file")
|
45 |
outputs = gr.components.Textbox()
|
46 |
-
title = "
|
47 |
description = "An example of using TTS to generate speech from text."
|
48 |
article = ""
|
49 |
examples = [
|
@@ -67,7 +68,7 @@ def transcribe(inputs):
|
|
67 |
# inputs = f.read()
|
68 |
|
69 |
# load audio and pad/trim it to fit 30 seconds
|
70 |
-
result = model.transcribe(audio=inputs, language='
|
71 |
word_timestamps=False, verbose=True)
|
72 |
# ---------------------------------------------------
|
73 |
|
@@ -75,6 +76,22 @@ def transcribe(inputs):
|
|
75 |
return result["text"]
|
76 |
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
audio_chunked = gr.Interface(
|
79 |
fn=transcribe,
|
80 |
inputs=inputs,
|
@@ -100,11 +117,33 @@ microphone_chunked = gr.Interface(
|
|
100 |
description=description,
|
101 |
article=article,
|
102 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
demo = gr.Blocks()
|
105 |
with demo:
|
106 |
-
gr.TabbedInterface([audio_chunked, microphone_chunked], [
|
107 |
-
|
108 |
demo.queue(concurrency_count=1, max_size=5)
|
109 |
demo.launch(show_api=False)
|
110 |
|
|
|
1 |
import os
|
2 |
os.system("pip install git+https://github.com/openai/whisper.git")
|
3 |
+
from pytube import YouTube
|
4 |
import gradio as gr
|
5 |
from subprocess import call
|
6 |
import whisper
|
|
|
44 |
|
45 |
inputs = gr.components.Audio(type="filepath", label="Add audio file")
|
46 |
outputs = gr.components.Textbox()
|
47 |
+
title = "Transcribe multi-lingual audio clips"
|
48 |
description = "An example of using TTS to generate speech from text."
|
49 |
article = ""
|
50 |
examples = [
|
|
|
68 |
# inputs = f.read()
|
69 |
|
70 |
# load audio and pad/trim it to fit 30 seconds
|
71 |
+
result = model.transcribe(audio=inputs, language='english',
|
72 |
word_timestamps=False, verbose=True)
|
73 |
# ---------------------------------------------------
|
74 |
|
|
|
76 |
return result["text"]
|
77 |
|
78 |
|
79 |
+
# Transcribe youtube video
|
80 |
+
# define function for transcription
|
81 |
+
def youtube_transcript(url):
    """Download a YouTube video's audio and return its Whisper transcript.

    Args:
        url: Address of the YouTube video, as entered in the Gradio textbox.

    Returns:
        The transcribed text on success. On any failure (missing URL, no
        usable stream, download or transcription error) a human-readable
        string starting with ``'Error: '`` is returned instead of raising,
        so the message can be shown directly in the output textbox.
    """
    # A blank textbox used to fall through the `if url:` check and return
    # None; report it explicitly instead.
    if not url or not url.strip():
        return 'Error: Please provide a YouTube URL'

    try:
        # NOTE(review): use_oauth=True is kept from the original; confirm the
        # login flow it triggers works on a headless host.
        yt = YouTube(url.strip(), use_oauth=True)

        # Only the audio is transcribed, so prefer the audio-only stream over
        # downloading the highest-resolution video.
        stream = yt.streams.get_audio_only()
        if stream is None:
            # Fall back to the original selection: best progressive mp4.
            stream = yt.streams.filter(
                progressive=True, file_extension='mp4'
            ).order_by('resolution').desc().first()
        if stream is None:
            return 'Error: No downloadable stream found for this video'

        source = stream.download('output/youtube')

        transcript = model.transcribe(source)
        return transcript["text"]
    except Exception as e:
        # Top-level UI boundary: log the failure and surface it to the user.
        print('Error: ', e)
        return 'Error: ' + str(e)
|
93 |
+
|
94 |
+
|
95 |
audio_chunked = gr.Interface(
|
96 |
fn=transcribe,
|
97 |
inputs=inputs,
|
|
|
117 |
description=description,
|
118 |
article=article,
|
119 |
)
|
120 |
+
# Gradio interface for the YouTube flow: one URL textbox in, one transcription
# textbox (with a copy button) out, handled by youtube_transcript.
# Components come from gr.components, matching the audio inputs/outputs
# defined earlier in this file, instead of the deprecated gr.inputs /
# gr.outputs namespaces.
# NOTE(review): title/description/article are the shared module-level strings;
# the description text talks about TTS - confirm it suits this tab.
youtube_chunked = gr.Interface(
    fn=youtube_transcript,
    inputs=[
        gr.components.Textbox(label="Youtube URL", type="text"),
    ],
    outputs=[
        gr.components.Textbox(label="Transcription").style(
            show_copy_button=True),
    ],
    allow_flagging="never",
    title=title,
    description=description,
    article=article,
    examples=[
        ["https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24"],
        ["https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren"],
        ["https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision"],
    ],
)
|
142 |
|
143 |
# Build the top-level app: a Blocks container holding one tab per interface
# (YouTube, uploaded audio file, microphone), in that order.
with gr.Blocks() as demo:
    gr.TabbedInterface(
        [youtube_chunked, audio_chunked, microphone_chunked],
        ["Youtube", "Audio File", "Microphone"],
    )

# Enable request queuing with the configured limits, then start the server
# with the API page hidden.
demo.queue(concurrency_count=1, max_size=5)
demo.launch(show_api=False)
|
149 |
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
whisper
|
2 |
gradio===3.27.0
|
|
|
|
1 |
whisper
|
2 |
gradio===3.27.0
|
3 |
+
pytube
|