Maximofn committed on
Commit
d51a666
·
1 Parent(s): 6c1a0f6

Transcribe text

Browse files
assets/youtube-no-thumbnails.webp ADDED
assets/youtube_error.webp ADDED
create conda environment.md CHANGED
@@ -17,7 +17,7 @@ pip install huggingface-hub
17
  pip install torch
18
  pip install torchaudio
19
  <!-- pip install pysndfile==1.0.0 -->
20
- mamba install -y -c conda-forge libsndfile==1.0.31 pyperclip
21
 
22
  # Download videos
23
  pip install twitch-dl
 
17
  pip install torch
18
  pip install torchaudio
19
  <!-- pip install pysndfile==1.0.0 -->
20
+ mamba install -y -c conda-forge libsndfile==1.0.31 pyperclip ipywidgets
21
 
22
  # Download videos
23
  pip install twitch-dl
requirements.txt CHANGED
@@ -1,11 +1,13 @@
1
- fairseq2
2
- git+https://github.com/facebookresearch/seamless_communication
3
- gradio
4
- huggingface_hub
5
- torch
6
- torchaudio
7
  # libsndfile==1.0.31
8
  # pysndfile
 
 
 
 
9
  twitch-dl
10
  pytube
11
  pyperclip
 
 
 
1
+ # fairseq2
2
+ # git+https://github.com/facebookresearch/seamless_communication
 
 
 
 
3
  # libsndfile==1.0.31
4
  # pysndfile
5
+ gradio
6
+ # huggingface_hub
7
+ torch
8
+ # torchaudio
9
  twitch-dl
10
  pytube
11
  pyperclip
12
+ transformers
13
+ git+https://github.com/openai/whisper.git
translatube.py CHANGED
@@ -4,42 +4,76 @@ import urllib.parse as urlparse
4
  from pytube import YouTube
5
  import re
6
  import subprocess
 
7
  from lang_list import ORIGINAL_LANGUAGE_NAME_TO_CODE, S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES
 
8
  import torch
9
- from seamless_communication.models.inference import Translator
10
- import time
 
 
 
 
11
 
12
  YOUTUBE = "youtube"
13
  TWITCH = "twitch"
14
-
15
- # Initialize a Translator object with a multitask model, vocoder on the GPU.
16
- # translator = Translator("seamlessM4T_large", vocoder_name_or_card="vocoder_36langs", device=torch.device("cuda:0"))
17
 
18
  def copy_url_from_clipboard():
19
  return pyperclip.paste()
20
 
21
  def clear_video_url():
22
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  def get_youtube_thumbnail(video_id):
25
  thumbnail_url = f"https://img.youtube.com/vi/{video_id}/0.jpg"
26
  return thumbnail_url
27
 
28
  def get_youtube_video_id(url):
29
- parsed_url = urlparse.urlparse(url)
30
- video_id = urlparse.parse_qs(parsed_url.query).get('v')
31
- if video_id:
32
- thumbnail_url = get_youtube_thumbnail(video_id[0])
33
  return thumbnail_url
34
  else:
35
- return None
 
 
 
 
 
 
36
 
37
  def is_valid_url(url):
38
- button = gr.Button(size="sm", value="translate", min_width="10px", scale=0, visible=True)
39
- original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=True, interactive=False)
40
- translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=True)
41
  source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
42
  target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
 
 
 
 
 
 
43
  if "youtube" in url.lower() or "youtu.be" in url.lower():
44
  thumbnail = get_youtube_video_id(url)
45
  if thumbnail:
@@ -47,21 +81,64 @@ def is_valid_url(url):
47
  gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
48
  source_languaje,
49
  target_languaje,
50
- button,
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  gr.Textbox(value=YOUTUBE, label="Stream page", elem_id="stream_page", visible=False),
52
  original_audio,
 
53
  translated_audio,
54
- )
 
 
55
  elif "twitch" in url.lower() or "twitch.tv" in url.lower():
56
  return (
57
  gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
58
  source_languaje,
59
  target_languaje,
60
- button,
61
  gr.Textbox(value=TWITCH, label="Stream page", elem_id="stream_page", visible=False),
62
  original_audio,
 
63
  translated_audio,
64
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  def get_audio_from_video(url, stream_page):
67
  if stream_page == YOUTUBE:
@@ -78,25 +155,71 @@ def get_audio_from_video(url, stream_page):
78
  audio_stream = audio_streams.filter(abr=abr_list[0]).first()
79
 
80
  # Download the audio
81
- audio_stream.download(filename="audio.mp3")
 
82
 
83
- return gr.Audio("audio.mp3", label="Original audio", elem_id="original_audio", visible=True)
 
 
 
84
  elif stream_page == TWITCH:
85
  # Get the video id
86
  video_id = re.search("\d{10}", url).group(0)
87
 
88
  # Download the video
89
- subprocess.run(["twitch-dl", "download", "--overwrite", "-q", "audio_only", "--output", "audio.mkv", video_id])
 
 
 
 
 
 
90
 
91
- return gr.Audio("audio.mkv", label="Original audio", elem_id="original_audio", visible=True)
 
 
 
 
 
 
 
 
 
92
 
93
- # def translate_audio(input_audio, target_languaje):
94
- # print("Translating audio...")
95
- # time.sleep(5)
96
- # print("Translating audio...")
97
- # _, wav, _ = translator.predict(input_audio, "s2st", target_languaje)
98
- # return gr.Audio(wav, label="Translated audio", elem_id="translated_audio", visible=True)
 
 
 
 
 
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  with gr.Blocks() as demo:
102
  with gr.Row(variant="panel"):
@@ -104,7 +227,6 @@ with gr.Blocks() as demo:
104
  copy_button = gr.Button(size="sm", icon="icons/copy.svg", value="", min_width="10px", scale=0)
105
  delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="", min_width="10px", scale=0)
106
  copy_button.click(fn=copy_url_from_clipboard, outputs=url_textbox)
107
- delete_button.click(fn=clear_video_url, outputs=url_textbox)
108
 
109
  stream_page = gr.Textbox(label="Stream page", elem_id="stream_page", visible=False)
110
  visible = False
@@ -112,14 +234,53 @@ with gr.Blocks() as demo:
112
  image = gr.Image(visible=visible, scale=1)
113
  with gr.Column():
114
  with gr.Row():
115
- source_languaje = gr.Dropdown(visible=visible, label="Source languaje", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
116
- target_languaje = gr.Dropdown(visible=visible, label="Target languaje", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
117
- translate_button = gr.Button(size="lg", value="translate", min_width="10px", visible=visible)
 
 
118
 
119
- original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible)
 
 
 
 
 
120
  translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
121
- url_textbox.change(fn=is_valid_url, inputs=url_textbox, outputs=[image, source_languaje, target_languaje, translate_button, stream_page, original_audio, translated_audio])
122
- translate_button.click(fn=get_audio_from_video, inputs=[url_textbox, stream_page], outputs=original_audio)
123
- # original_audio.change(fn=translate_audio, inputs=[original_audio, target_languaje], outputs=translated_audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  demo.launch()
 
4
  from pytube import YouTube
5
  import re
6
  import subprocess
7
+ import time
8
  from lang_list import ORIGINAL_LANGUAGE_NAME_TO_CODE, S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES
9
+
10
  import torch
11
+ import whisper
12
+
13
+ # get device
14
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
+ # device = torch.device("cpu")
16
+ model = whisper.load_model("large-v2", device=device)
17
 
18
  YOUTUBE = "youtube"
19
  TWITCH = "twitch"
20
+ ERROR = "error"
 
 
21
 
def copy_url_from_clipboard():
    """Return whatever text is currently held in the system clipboard."""
    clipboard_text = pyperclip.paste()
    return clipboard_text
24
 
def clear_video_url():
    """Reset the URL textbox and hide every widget that depends on it.

    Returns:
        A 10-tuple matching the ``delete_button.click`` outputs: an empty
        string for the URL textbox, then hidden versions of the image, the
        source and target language dropdowns, the "get audio" button, the
        "transcribe audio" button, the original audio player, the
        transcription textbox, the translated audio player and the
        translation textbox.
    """
    visible = False
    image = gr.Image(visible=visible, scale=1)
    source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
    target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
    get_audio_button = gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=visible)
    # Built exactly once; a second identical construction further down was a
    # dead store that only overwrote this one.
    transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=visible)
    original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
    original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=visible)
    original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
    translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
    return (
        "",
        image,
        source_languaje,
        target_languaje,
        get_audio_button,
        transcribe_audio_button,
        original_audio,
        original_audio_transcribed,
        translated_audio,
        original_audio_translated,
    )
49
 
def get_youtube_thumbnail(video_id):
    """Build the img.youtube.com thumbnail URL for the given video id."""
    return f"https://img.youtube.com/vi/{video_id}/0.jpg"
53
 
def get_youtube_video_id(url):
    """Return a thumbnail URL for a YouTube link, or ``None`` if unavailable.

    Despite its name, this function returns a thumbnail URL, not a video id.

    Args:
        url: The URL typed by the user. It may still be incomplete, because
            the caller runs on every change of the URL textbox.

    Returns:
        For ``youtu.be`` links, the thumbnail URL reported by pytube. For
        other links, the thumbnail built from the ``v`` query parameter.
        ``None`` when no thumbnail can be determined; the caller then shows
        its "no thumbnail" placeholder image.
    """
    if "youtu.be" in url.lower():
        # Local import: only this branch needs pytube's exception base class.
        from pytube.exceptions import PytubeError

        try:
            return YouTube(url).thumbnail_url
        except (PytubeError, OSError):
            # pytube rejects URLs it cannot extract a video id from, and the
            # metadata lookup can fail on the network (urllib errors are
            # OSError subclasses). Report "no thumbnail" instead of letting
            # the exception escape into the UI event handler.
            return None

    query = urlparse.parse_qs(urlparse.urlparse(url).query)
    video_id = query.get('v')
    if not video_id:
        return None
    return get_youtube_thumbnail(video_id[0])
67
 
68
  def is_valid_url(url):
 
 
 
69
  source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
70
  target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
71
+ get_audio_button = gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=True)
72
+ original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=True, interactive=False)
73
+ original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=True)
74
+ original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=True)
75
+ translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=True)
76
+ transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=True)
77
  if "youtube" in url.lower() or "youtu.be" in url.lower():
78
  thumbnail = get_youtube_video_id(url)
79
  if thumbnail:
 
81
  gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
82
  source_languaje,
83
  target_languaje,
84
+ get_audio_button,
85
+ gr.Textbox(value=YOUTUBE, label="Stream page", elem_id="stream_page", visible=False),
86
+ original_audio,
87
+ original_audio_transcribed,
88
+ translated_audio,
89
+ transcribe_audio_button,
90
+ original_audio_translated,
91
+ )
92
+ else:
93
+ return (
94
+ gr.Image(value="assets/youtube-no-thumbnails.webp", visible=True, show_download_button=False, container=False),
95
+ source_languaje,
96
+ target_languaje,
97
+ get_audio_button,
98
  gr.Textbox(value=YOUTUBE, label="Stream page", elem_id="stream_page", visible=False),
99
  original_audio,
100
+ original_audio_transcribed,
101
  translated_audio,
102
+ transcribe_audio_button,
103
+ original_audio_translated,
104
+ )
105
  elif "twitch" in url.lower() or "twitch.tv" in url.lower():
106
  return (
107
  gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
108
  source_languaje,
109
  target_languaje,
110
+ get_audio_button,
111
  gr.Textbox(value=TWITCH, label="Stream page", elem_id="stream_page", visible=False),
112
  original_audio,
113
+ original_audio_transcribed,
114
  translated_audio,
115
+ transcribe_audio_button,
116
+ original_audio_translated,
117
+ )
118
+ else:
119
+ visible = False
120
+ image = gr.Image(value="assets/youtube_error.webp", visible=visible, show_download_button=False, container=False)
121
+ source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
122
+ target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
123
+ get_audio_button = gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=visible)
124
+ stream_page = gr.Textbox(value=ERROR, label="Stream page", elem_id="stream_page", visible=visible)
125
+ original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
126
+ original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=visible)
127
+ original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
128
+ translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
129
+ transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=visible)
130
+ return (
131
+ image,
132
+ source_languaje,
133
+ target_languaje,
134
+ get_audio_button,
135
+ stream_page,
136
+ original_audio,
137
+ original_audio_transcribed,
138
+ translated_audio,
139
+ transcribe_audio_button,
140
+ original_audio_translated,
141
+ )
142
 
143
  def get_audio_from_video(url, stream_page):
144
  if stream_page == YOUTUBE:
 
155
  audio_stream = audio_streams.filter(abr=abr_list[0]).first()
156
 
157
  # Download the audio
158
+ filename = "audio.mp3"
159
+ audio_stream.download(filename=filename)
160
 
161
+ return (
162
+ gr.Audio(value=filename, label="Original audio", elem_id="original_audio", visible=True, interactive=False),
163
+ gr.Textbox(value=filename, label="Stream page", elem_id="stream_page", visible=False)
164
+ )
165
  elif stream_page == TWITCH:
166
  # Get the video id
167
  video_id = re.search("\d{10}", url).group(0)
168
 
169
  # Download the video
170
+ filename = "audio.mkv"
171
+ subprocess.run(["twitch-dl", "download", "--overwrite", "-q", "audio_only", "--output", filename, video_id])
172
+
173
+ return (
174
+ gr.Audio(value=filename, label="Original audio", elem_id="original_audio", visible=True, interactive=False),
175
+ gr.Textbox(value=filename, label="Stream page", elem_id="stream_page", visible=False)
176
+ )
177
 
def trascribe_audio(audio_path):
    """Transcribe an audio file with the module-level Whisper ``model``.

    Args:
        audio_path: Path of the audio file, loaded with ``whisper.load_audio``.

    Returns:
        A 2-tuple: the transcribed text, and a hidden ``gr.Textbox`` whose
        value is the name of the file ("result.txt") the text was saved to.
    """
    audio = whisper.load_audio(audio_path)
    # NOTE(review): pad_or_trim presumably limits the input to Whisper's
    # fixed-length window, so only the start of long audio is transcribed
    # -- confirm.
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # No language is set in the options, so decoding determines it itself.
    # The former standalone model.detect_language(mel) call only produced a
    # value that was never used, so it has been dropped.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    # Save the result to a file
    filename = "result.txt"
    with open(filename, "w") as f:
        f.write(result.text)

    # The downloaded audio file is deliberately left on disk.

    return (
        result.text,
        gr.Textbox(value=filename, label="Original audio transcribed", elem_id="original_audio_transcribed", visible=False)
    )
201
 
def translate(original_audio_transcribed_path, source_languaje, target_languaje):
    """Produce the translated text for a saved transcription.

    Translation is not implemented yet: the transcription is passed through
    unchanged, and ``source_languaje`` / ``target_languaje`` are unused.

    Args:
        original_audio_transcribed_path: Path of the text file holding the
            transcription.
        source_languaje: Source language chosen in the UI (currently unused).
        target_languaje: Target language chosen in the UI (currently unused).

    Returns:
        A 2-tuple: the translated text, and a hidden ``gr.Textbox`` whose
        value is the name of the file ("translated_text.txt") it was saved to.
    """
    with open(original_audio_transcribed_path, "r") as f:
        text = f.read()
    # Placeholder until a real translation step exists.
    translated = text

    # Save the translated text, not the source text, so the file stays
    # correct once an actual translation replaces the pass-through above.
    filename = "translated_text.txt"
    with open(filename, "w") as f:
        f.write(translated)

    # The transcription file is deliberately left on disk.

    return (
        translated,
        gr.Textbox(value=filename, label="Original audio translated", elem_id="original_audio_translated", visible=False)
    )
220
+
def tex2speech(original_audio_translated_path):
    """Text-to-speech placeholder: does nothing and returns ``None``."""
    return None
223
 
224
  with gr.Blocks() as demo:
225
  with gr.Row(variant="panel"):
 
227
  copy_button = gr.Button(size="sm", icon="icons/copy.svg", value="", min_width="10px", scale=0)
228
  delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="", min_width="10px", scale=0)
229
  copy_button.click(fn=copy_url_from_clipboard, outputs=url_textbox)
 
230
 
231
  stream_page = gr.Textbox(label="Stream page", elem_id="stream_page", visible=False)
232
  visible = False
 
234
  image = gr.Image(visible=visible, scale=1)
235
  with gr.Column():
236
  with gr.Row():
237
+ source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
238
+ target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
239
+ with gr.Row():
240
+ get_audio_button = gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=visible)
241
+ transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=visible)
242
 
243
+ original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
244
+ original_audio_path = gr.Textbox(label="Stream page", elem_id="stream_page", visible=False)
245
+ original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=visible)
246
+ original_audio_transcribed_path = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", visible=False)
247
+ original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
248
+ original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=False)
249
  translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
250
+ url_textbox.change(
251
+ fn=is_valid_url,
252
+ inputs=url_textbox,
253
+ outputs=[
254
+ image,
255
+ source_languaje,
256
+ target_languaje,
257
+ get_audio_button,
258
+ stream_page,
259
+ original_audio,
260
+ original_audio_transcribed,
261
+ translated_audio,
262
+ transcribe_audio_button,
263
+ original_audio_translated,
264
+ ]
265
+ )
266
+ delete_button.click(
267
+ fn=clear_video_url,
268
+ outputs=[
269
+ url_textbox,
270
+ image,
271
+ source_languaje,
272
+ target_languaje,
273
+ get_audio_button,
274
+ transcribe_audio_button,
275
+ original_audio,
276
+ original_audio_transcribed,
277
+ translated_audio,
278
+ original_audio_translated,
279
+ ]
280
+ )
281
+ get_audio_button.click(fn=get_audio_from_video, inputs=[url_textbox, stream_page], outputs=[original_audio, original_audio_path])
282
+ original_audio.change(fn=trascribe_audio, inputs=original_audio_path, outputs=[original_audio_transcribed, original_audio_transcribed_path])
283
+ original_audio_transcribed.change(fn=translate, inputs=[original_audio_transcribed_path, source_languaje, target_languaje], outputs=[original_audio_translated, original_audio_translated_path])
284
+ original_audio_translated.change(fn=tex2speech, inputs=original_audio_translated_path, outputs=translated_audio)
285
 
286
  demo.launch()