thealphamerc committed on
Commit
fdad218
·
1 Parent(s): 19befe8

Added YouTube video-to-text support

Browse files
Files changed (7) hide show
  1. .gitignore +1 -0
  2. Output/audio.txt +0 -1
  3. Output/audio2.txt +0 -1
  4. Output/audio3.json +0 -173
  5. Output/audio3.txt +0 -1
  6. app.py +43 -4
  7. requirements.txt +1 -0
.gitignore CHANGED
@@ -0,0 +1 @@
 
 
1
+ output/
Output/audio.txt DELETED
@@ -1 +0,0 @@
1
- Quatlin, quatlin quatlin quatlin quatlin. Anti-six.
 
 
Output/audio2.txt DELETED
@@ -1 +0,0 @@
1
- to gain life in all that...
 
 
Output/audio3.json DELETED
@@ -1,173 +0,0 @@
1
- [
2
- {
3
- "id": 0,
4
- "seek": 0,
5
- "start": 0.0,
6
- "end": 1.52,
7
- "text": " Come and sit on a rock.",
8
- "tokens": [
9
- 50363,
10
- 7911,
11
- 290,
12
- 1650,
13
- 319,
14
- 257,
15
- 3881,
16
- 13,
17
- 50439
18
- ],
19
- "temperature": 0.0,
20
- "avg_logprob": -0.34572365704704733,
21
- "compression_ratio": 1.356164383561644,
22
- "no_speech_prob": 0.01958448439836502
23
- },
24
- {
25
- "id": 1,
26
- "seek": 0,
27
- "start": 1.52,
28
- "end": 5.08,
29
- "text": " Overlooking the river's blow, he wears a hat and some glasses.",
30
- "tokens": [
31
- 50439,
32
- 3827,
33
- 11534,
34
- 262,
35
- 7850,
36
- 338,
37
- 6611,
38
- 11,
39
- 339,
40
- 17326,
41
- 257,
42
- 6877,
43
- 290,
44
- 617,
45
- 15232,
46
- 13,
47
- 50617
48
- ],
49
- "temperature": 0.0,
50
- "avg_logprob": -0.34572365704704733,
51
- "compression_ratio": 1.356164383561644,
52
- "no_speech_prob": 0.01958448439836502
53
- },
54
- {
55
- "id": 2,
56
- "seek": 0,
57
- "start": 5.08,
58
- "end": 7.36,
59
- "text": " A smile on his face.",
60
- "tokens": [
61
- 50617,
62
- 317,
63
- 8212,
64
- 319,
65
- 465,
66
- 1986,
67
- 13,
68
- 50731
69
- ],
70
- "temperature": 0.0,
71
- "avg_logprob": -0.34572365704704733,
72
- "compression_ratio": 1.356164383561644,
73
- "no_speech_prob": 0.01958448439836502
74
- },
75
- {
76
- "id": 3,
77
- "seek": 0,
78
- "start": 7.36,
79
- "end": 8.56,
80
- "text": " He's not lost.",
81
- "tokens": [
82
- 50731,
83
- 679,
84
- 338,
85
- 407,
86
- 2626,
87
- 13,
88
- 50791
89
- ],
90
- "temperature": 0.0,
91
- "avg_logprob": -0.34572365704704733,
92
- "compression_ratio": 1.356164383561644,
93
- "no_speech_prob": 0.01958448439836502
94
- },
95
- {
96
- "id": 4,
97
- "seek": 0,
98
- "start": 8.56,
99
- "end": 10.4,
100
- "text": " The water rushes by.",
101
- "tokens": [
102
- 50791,
103
- 383,
104
- 1660,
105
- 38596,
106
- 416,
107
- 13,
108
- 50883
109
- ],
110
- "temperature": 0.0,
111
- "avg_logprob": -0.34572365704704733,
112
- "compression_ratio": 1.356164383561644,
113
- "no_speech_prob": 0.01958448439836502
114
- },
115
- {
116
- "id": 5,
117
- "seek": 0,
118
- "start": 10.4,
119
- "end": 12.08,
120
- "text": " A constant sound.",
121
- "tokens": [
122
- 50883,
123
- 317,
124
- 6937,
125
- 2128,
126
- 13,
127
- 50967
128
- ],
129
- "temperature": 0.0,
130
- "avg_logprob": -0.34572365704704733,
131
- "compression_ratio": 1.356164383561644,
132
- "no_speech_prob": 0.01958448439836502
133
- },
134
- {
135
- "id": 6,
136
- "seek": 0,
137
- "start": 12.08,
138
- "end": 13.68,
139
- "text": " It takes in the view.",
140
- "tokens": [
141
- 50967,
142
- 632,
143
- 2753,
144
- 287,
145
- 262,
146
- 1570,
147
- 13,
148
- 51047
149
- ],
150
- "temperature": 0.0,
151
- "avg_logprob": -0.34572365704704733,
152
- "compression_ratio": 1.356164383561644,
153
- "no_speech_prob": 0.01958448439836502
154
- },
155
- {
156
- "id": 7,
157
- "seek": 0,
158
- "start": 13.68,
159
- "end": 14.48,
160
- "text": " The mountains.",
161
- "tokens": [
162
- 51047,
163
- 383,
164
- 12269,
165
- 13,
166
- 51087
167
- ],
168
- "temperature": 0.0,
169
- "avg_logprob": -0.34572365704704733,
170
- "compression_ratio": 1.356164383561644,
171
- "no_speech_prob": 0.01958448439836502
172
- }
173
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Output/audio3.txt DELETED
@@ -1 +0,0 @@
1
- Come and sit on a rock. Overlooking the river's blow, he wears a hat and some glasses. A smile on his face. He's not lost. The water rushes by. A constant sound. It takes in the view. The mountains.
 
 
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  os.system("pip install git+https://github.com/openai/whisper.git")
 
3
  import gradio as gr
4
  from subprocess import call
5
  import whisper
@@ -43,7 +44,7 @@ model = whisper.load_model("base")
43
 
44
  inputs = gr.components.Audio(type="filepath", label="Add audio file")
45
  outputs = gr.components.Textbox()
46
- title = "Audio To text⚡️"
47
  description = "An example of using TTS to generate speech from text."
48
  article = ""
49
  examples = [
@@ -67,7 +68,7 @@ def transcribe(inputs):
67
  # inputs = f.read()
68
 
69
  # load audio and pad/trim it to fit 30 seconds
70
- result = model.transcribe(audio=inputs, language='hindi',
71
  word_timestamps=False, verbose=True)
72
  # ---------------------------------------------------
73
 
@@ -75,6 +76,22 @@ def transcribe(inputs):
75
  return result["text"]
76
 
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  audio_chunked = gr.Interface(
79
  fn=transcribe,
80
  inputs=inputs,
@@ -100,11 +117,33 @@ microphone_chunked = gr.Interface(
100
  description=description,
101
  article=article,
102
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  demo = gr.Blocks()
105
  with demo:
106
- gr.TabbedInterface([audio_chunked, microphone_chunked], [
107
- "Audio File", "Microphone"])
108
  demo.queue(concurrency_count=1, max_size=5)
109
  demo.launch(show_api=False)
110
 
 
1
  import os
2
  os.system("pip install git+https://github.com/openai/whisper.git")
3
+ from pytube import YouTube
4
  import gradio as gr
5
  from subprocess import call
6
  import whisper
 
44
 
45
  inputs = gr.components.Audio(type="filepath", label="Add audio file")
46
  outputs = gr.components.Textbox()
47
+ title = "Transcribe multi-lingual audio clips"
48
  description = "An example of using TTS to generate speech from text."
49
  article = ""
50
  examples = [
 
68
  # inputs = f.read()
69
 
70
  # load audio and pad/trim it to fit 30 seconds
71
+ result = model.transcribe(audio=inputs, language='english',
72
  word_timestamps=False, verbose=True)
73
  # ---------------------------------------------------
74
 
 
76
  return result["text"]
77
 
78
 
79
+ # Transcribe youtube video
80
+ # define function for transcription
81
+ def youtube_transcript(url):
82
+ try:
83
+ if url:
84
+ yt = YouTube(url, use_oauth=True)
85
+ source = yt.streams.filter(progressive=True, file_extension='mp4').order_by(
86
+ 'resolution').desc().first().download('output/youtube')
87
+
88
+ transcript = model.transcribe(source)
89
+ return transcript["text"]
90
+ except Exception as e:
91
+ print('Error: ', e)
92
+ return 'Error: ' + str(e)
93
+
94
+
95
  audio_chunked = gr.Interface(
96
  fn=transcribe,
97
  inputs=inputs,
 
117
  description=description,
118
  article=article,
119
  )
120
+ youtube_chunked = gr.Interface(
121
+ fn=youtube_transcript,
122
+ inputs=[
123
+ gr.inputs.Textbox(label="Youtube URL", type="text"),
124
+ ],
125
+ outputs=[
126
+ gr.outputs.Textbox(label="Transcription").style(
127
+ show_copy_button=True),
128
+ ],
129
+ allow_flagging="never",
130
+ title=title,
131
+
132
+ description=description,
133
+ article=article,
134
+ examples=[
135
+ [ "https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24",],
136
+ ["https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren",],
137
+ ["https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision"]
138
+
139
+ ],
140
+
141
+ )
142
 
143
  demo = gr.Blocks()
144
  with demo:
145
+ gr.TabbedInterface([youtube_chunked, audio_chunked, microphone_chunked], [
146
+ "Youtube", "Audio File", "Microphone"])
147
  demo.queue(concurrency_count=1, max_size=5)
148
  demo.launch(show_api=False)
149
 
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  whisper
2
  gradio===3.27.0
 
 
1
  whisper
2
  gradio===3.27.0
3
+ pytube