Spaces:

altryne
/

vidtranslator

Build error

App Files Files Community

Alex Volkov committed on Oct 13, 2022

Commit

2e0131e

1 Parent(s): 5efed34

Captions API support

Browse files

Files changed (4) hide show

app.py +56 -12
download.py +127 -41
requirements.txt +3 -2
utils/apis.py +6 -5

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio
 import gradio as gr
-from download import download_generator
 import anvil.media
 import os
 import dotenv
@@ -24,16 +24,29 @@ preload_model: str = args.get("preload")
 LANG_CHOICES = sorted([x.capitalize() for x in LANGUAGES.values()])
 LANG_CHOICES.insert(0, "Autodetect")
-url_input = gr.Textbox(label="Youtube/Twitter/etc video URL (supports many services)", value='https://twitter.com/starsonxh/status/1552945347194142720', lines=1, elem_id="url_input")
 # download_status = gr.Textbox(label="Status:", value='', lines=1, elem_id="download_status")
 download_status = gr.Checkbox(label="", elem_id="download_status", interactive=False)
 translate_action = gr.Checkbox(label="Auto translate to english", elem_id='translate_toggle', interactive=True, value=True)
 init_video = gr.Video(label="Upload video manually", visible=True, interactive=True, mirror_webcam=False)
 init_audio = gr.Audio(label="Downloaded audio", visible=False)
 output_text = gr.Textbox(label="Output text", lines=5, visible=False, max_lines=10, interactive=True, elem_id="output_text")
 sub_video = gr.Video(label="Subbed video", visible=False, mirror_webcam=False)
 def predownload(url, translate_action, source_language):
   files = []
@@ -54,10 +67,13 @@ def predownload(url, translate_action, source_language):
                                               label=f"Subtitles transcribed from {response['whisper_result'].get('language')} (detected language)")
     if 'srt_path' in response:
       files.append(response["srt_path"])
     if 'sub_video' in response:
       updates_object[sub_video] = gr.update(visible=True, value=response["sub_video"],
                                             label=f"Subbed video: {meta['id']}_translated.mp4")
       files.append(response["sub_video"])
     updates_object[output_file] = gr.update(value=files, visible=len(files) > 0, label=f"Output Files")
@@ -105,9 +121,10 @@ with gr.Blocks(css='@import "file=static/css/main.css";', theme='darkpeach', tit
     with gr.Column():
       sub_video.render()
-  outputs = [download_status, init_video, init_audio, output_text, sub_video, output_file ]
   inputs = [url_input, translate_action, source_language]
   action_btn.click(fn=predownload, inputs=inputs, outputs=outputs, api_name='predownload')
   url_input.submit(fn=predownload, inputs=inputs, outputs=outputs)
@@ -116,7 +133,7 @@ with gr.Blocks(css='@import "file=static/css/main.css";', theme='darkpeach', tit
   translate_action.change(fn=lambda x: {action_btn: gr.update(value=f"Translate" if x else "Transcribe")},
                           inputs=[translate_action], outputs=[action_btn])
   gr.HTML("""<div class='footer'>
     <div class="relative">
   <div class="absolute inset-0 flex items-center" aria-hidden="true">
@@ -131,10 +148,40 @@ with gr.Blocks(css='@import "file=static/css/main.css";', theme='darkpeach', tit
      </div>""")
   def init_video_manual_upload(url, init_video):
-    print(url)
-    print(init_video)
-  init_video.change(fn=init_video_manual_upload, inputs=[url_input, init_video], outputs=[])
   # Render imported buttons for API bindings
   render_api_elements(url_input,download_status, output_text, sub_video, output_file)
@@ -142,9 +189,6 @@ with gr.Blocks(css='@import "file=static/css/main.css";', theme='darkpeach', tit
 queue_placeholder = demo.queue()
-@anvil.server.callable
-def temp():
-  return 'temp worked'
 if __name__ == "__main__":
   gradio.close_all()

 import gradio
 import gradio as gr
+from download import download_generator, user_uploaded_video_generator
 import anvil.media
 import os
 import dotenv
 LANG_CHOICES = sorted([x.capitalize() for x in LANGUAGES.values()])
 LANG_CHOICES.insert(0, "Autodetect")
+# HTML5 <video> template, filled in with VIDEO_HTML.format(src=..., en_vtt=...).
+# The braces of data-setup are doubled so that str.format emits a literal '{}'
+# instead of treating it as a positional field (which raises IndexError).
+VIDEO_HTML = """
+<video
+  class="video-js"
+  controls
+  preload="auto"
+  width="640"
+  height="264"
+  data-setup='{{}}'>
+  <source src="{src}" type="video/mp4">
+  <track kind="captions" src="{en_vtt}" srclang="en" label="English" default>
+</video>
+"""
+url_input = gr.Textbox(label="Youtube/Twitter/etc video URL (supports many services)", lines=1, elem_id="url_input")
 # download_status = gr.Textbox(label="Status:", value='', lines=1, elem_id="download_status")
 download_status = gr.Checkbox(label="", elem_id="download_status", interactive=False)
 translate_action = gr.Checkbox(label="Auto translate to english", elem_id='translate_toggle', interactive=True, value=True)
 init_video = gr.Video(label="Upload video manually", visible=True, interactive=True, mirror_webcam=False)
 init_audio = gr.Audio(label="Downloaded audio", visible=False)
 output_text = gr.Textbox(label="Output text", lines=5, visible=False, max_lines=10, interactive=True, elem_id="output_text")
+output_text_2 = gr.Textbox(label="Output text 2", lines=5, visible=False, max_lines=10, interactive=True, elem_id="output_text")
 sub_video = gr.Video(label="Subbed video", visible=False, mirror_webcam=False)
+sub_video_html = gr.HTML(value=f"<div> Please wait for video to load </div>")
 def predownload(url, translate_action, source_language):
   files = []
                                               label=f"Subtitles transcribed from {response['whisper_result'].get('language')} (detected language)")
     if 'srt_path' in response:
       files.append(response["srt_path"])
+    if 'vtt_path' in response:
+      # Offer the WebVTT file itself for download (the SRT path used to be appended a second time).
+      files.append(response["vtt_path"])
     if 'sub_video' in response:
       updates_object[sub_video] = gr.update(visible=True, value=response["sub_video"],
                                             label=f"Subbed video: {meta['id']}_translated.mp4")
+      # VIDEO_HTML has two named fields; str.format raises KeyError unless both are supplied.
+      # The captions track is left empty when this response carries no VTT file.
+      vtt_file = response.get("vtt_path")
+      updates_object[sub_video_html] = gr.update(value=VIDEO_HTML.format(
+        src=f"file={response['sub_video']}",
+        en_vtt=f"file={vtt_file}" if vtt_file else ""))
       files.append(response["sub_video"])
     updates_object[output_file] = gr.update(value=files, visible=len(files) > 0, label=f"Output Files")
     with gr.Column():
       sub_video.render()
+      sub_video_html.render()
+  outputs = [download_status, init_video, init_audio, output_text, sub_video, output_file, sub_video_html]
   inputs = [url_input, translate_action, source_language]
   action_btn.click(fn=predownload, inputs=inputs, outputs=outputs, api_name='predownload')
   url_input.submit(fn=predownload, inputs=inputs, outputs=outputs)
   translate_action.change(fn=lambda x: {action_btn: gr.update(value=f"Translate" if x else "Transcribe")},
                           inputs=[translate_action], outputs=[action_btn])
+  examples = gr.Examples([["https://twitter.com/starsonxh/status/1552945347194142720", "Adam"], ["https://twitter.com/starsonxh/status/1552945347194142720", "Eve"]], [url_input, output_text] )
   gr.HTML("""<div class='footer'>
     <div class="relative">
   <div class="absolute inset-0 flex items-center" aria-hidden="true">
      </div>""")
   def init_video_manual_upload(url, init_video):
+    """
+    Stream UI updates while a manually uploaded video is processed.
+    :param url: value of the URL textbox; a non-empty URL means the upload is ignored
+    :param init_video: the uploaded video, passed on to user_uploaded_video_generator
+    :return: generator of {component: gr.update(...)} dicts, one per processing step
+    """
+    if url:
+      # A URL is present, so the upload is ignored; ending the generator yields no updates.
+      return
+    files = []
+    for response in user_uploaded_video_generator(init_video):
+      updates_object = {}
+      updates_object[download_status] = gr.update(label=f"{response.get('message')}")
+      # Fall back to the uploaded file when the response carries no subtitled video.
+      video_src = f"file={response.get('sub_video', init_video)}"
+      if 'audio' in response:
+        updates_object[init_audio] = gr.update(visible=True, value=response["audio"],
+                                               label=f"Extracted audio")
+        files.append(response["audio"])
+        files.append(response["video"])
+      # NOTE(review): output_text / output_text_2 are not among the outputs of the
+      # init_video.change binding - confirm these updates are accepted by gradio.
+      if 'srt_path' in response:
+        updates_object[output_text] = gr.update(value=response['srt_path'], visible=True)
+        files.append(response["srt_path"])
+        # VIDEO_HTML is a str.format template (it has no %-conversions); no captions track is known here.
+        updates_object[sub_video_html] = gr.update(value=VIDEO_HTML.format(src=video_src, en_vtt=""))
+      if 'vtt_path' in response:
+        updates_object[output_text_2] = gr.update(value=response['vtt_path'], visible=True)
+        files.append(response["vtt_path"])
+        updates_object[sub_video_html] = gr.update(value=VIDEO_HTML.format(src=video_src, en_vtt=f"file={response['vtt_path']}"))
+      #
+      # updates_object[output_file] = gr.update(value=files, visible=len(files) > 0, label=f"Output Files")
+      yield updates_object
+  init_video.change(fn=init_video_manual_upload,
+                    inputs=[url_input, init_video],
+                    outputs=[download_status, init_audio, sub_video_html, output_file])
   # Render imported buttons for API bindings
   render_api_elements(url_input,download_status, output_text, sub_video, output_file)
 queue_placeholder = demo.queue()
 if __name__ == "__main__":
   gradio.close_all()

download.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import sys
 import time
 from pathlib import Path
 import anvil.server
 import anvil.media
-from whisper.utils import write_srt
-from youtube_dl import YoutubeDL
-from youtube_dl.utils import DownloadError
 import os
 import tempfile
 import json
@@ -61,6 +62,7 @@ def download_generator(url, translate_action=True, source_language='Autodetect',
     raise e
   srt_path = tempdir / f"{meta['id']}.srt"
   if not corrected_subtitles:
     ### Step 3 : Transcribe with whisper
@@ -70,9 +72,12 @@ def download_generator(url, translate_action=True, source_language='Autodetect',
       with open(srt_path, "w", encoding="utf-8") as srt:
         write_srt(whisper_result["segments"], file=srt)
       whisper_result["srt"] = Path(srt_path).read_text()
-      yield {"message": f"Transcribe successful", "whisper_result": whisper_result, "meta": meta, "srt_path": srt_path}
     except Exception as e:
       os.chdir(original_dir)
       yield {"message": f"{e}"}
@@ -106,51 +111,95 @@ def download_generator(url, translate_action=True, source_language='Autodetect',
     yield {"message": f"{e}"}
-def caption_generator(tweet_url, language="Autodetect", model_size=model_size):
-  # Download the file
-  try:
-    print(f"Downloading {tweet_url} ")
-    meta = check_download(tweet_url)
-    tempdir = output_dir / f"{meta['id']}"
-    print(f"Downloaded {meta['id']}.mp3 from {meta['uploader_id']} and url {meta['webpage_url']}")
-  except Exception as e:
-    print(f"Could not download file: {e}")
-    raise
-  try:
-    print(f"Starting audio only download with URL {tweet_url}, this may take a while")
-    meta, video, audio = download(tweet_url, tempdir, keepVideo=False)
-    print(f"Downloaded video and extracted audio")
-  except Exception as e:
-    print(f"Could not download file: {e}")
-    raise
   # Run whisper on the audio with language unless auto
   try:
-    print(f"Starting whisper transcribe with {meta['id']}.mp3")
-    transcribe_whisper_result = transcribe(audio, translate_action=False, language=language, override_model_size=model_size)
     detected_language = LANGUAGES[transcribe_whisper_result["language"]]
-    translate_whisper_result = transcribe(audio, translate_action=True, language=detected_language, override_model_size=model_size)
-    srt = get_srt(transcribe_whisper_result["segments"])
-    en_srt = get_srt(translate_whisper_result["segments"])
-    print(f"Transcribe successful!")
   except Exception as e:
     print(f"Could not transcribe file: {e}")
     return
-  return_dict = {
-    "detected_language": LANGUAGES[transcribe_whisper_result["language"]],
-    "requested_language": language,
-    "text": transcribe_whisper_result["text"],
-    "en_text": translate_whisper_result["text"],
-    "srt": srt,
-    "en_srt": en_srt,
-    "meta": meta,
-  }
-  return return_dict
   # Run whisper with translation task enabled (and save to different srt file)
   # Call anvil background task with both files, and both the plain texts
@@ -164,7 +213,7 @@ def progress_hook(d):
     print(filename)
     yield f"Downloaded {filename}"
-def download(url, tempdir, format="bestvideo[ext=mp4]+bestaudio/best", verbose=False, keepVideo=True):
   try:
     ydl_opts = {
       "format": format,
@@ -175,10 +224,10 @@ def download(url, tempdir, format="bestvideo[ext=mp4]+bestaudio/best", verbose=F
         'preferredquality': '192',
       }],
       "skip_download": False,
-      "outtmpl": f"{tempdir}/%(id)s.%(ext)s",
       "noplaylist": True,
       "verbose": verbose,
-      "quiet": True,
       "progress_hooks": [progress_hook],
     }
@@ -197,6 +246,35 @@ def download(url, tempdir, format="bestvideo[ext=mp4]+bestaudio/best", verbose=F
     else:
       return meta, None, str(audio.resolve())
 def check_download(url):
   ydl_opts = {
@@ -217,6 +295,14 @@ def check_download(url):
     return meta
 def transcribe(audio, translate_action=True, language='Autodetect', override_model_size=''):
   task = "translate" if translate_action else "transcribe"
   model_size_to_load = override_model_size if override_model_size else model_size
   print(f'Starting {task} with whisper size {model_size_to_load} on {audio}')

+import shutil
 import sys
 import time
 from pathlib import Path
 import anvil.server
 import anvil.media
+from whisper.utils import write_srt, write_vtt
+from yt_dlp import YoutubeDL
+from yt_dlp.utils import DownloadError
 import os
 import tempfile
 import json
     raise e
   srt_path = tempdir / f"{meta['id']}.srt"
+  vtt_path = tempdir / f"{meta['id']}.vtt"
   if not corrected_subtitles:
     ### Step 3 : Transcribe with whisper
       with open(srt_path, "w", encoding="utf-8") as srt:
         write_srt(whisper_result["segments"], file=srt)
+      with open(vtt_path, "w", encoding="utf-8") as vtt:
+        write_vtt(whisper_result["segments"], file=vtt)
       whisper_result["srt"] = Path(srt_path).read_text()
+      whisper_result["vtt"] = Path(vtt_path).read_text()
+      yield {"message": f"Transcribe successful", "whisper_result": whisper_result, "meta": meta, "srt_path": srt_path, "vtt_path": vtt_path}
     except Exception as e:
       os.chdir(original_dir)
       yield {"message": f"{e}"}
     yield {"message": f"{e}"}
+def user_uploaded_video_generator(video, translate_action=True, source_language='Autodetect', corrected_subtitles=None):
+  """
+  Process a manually uploaded video: copy it into the output directory, extract its audio,
+  then run whisper in transcribe and translate mode, yielding a status dict after each step.
+  :param video: path of the uploaded video file
+  :param translate_action: currently unused; the translation pass always runs
+  :param source_language: String - language passed to the transcription pass, default is Autodetect
+  :param corrected_subtitles: currently unused
+  :return: generator of dicts with a "message" key, plus "video" / "audio" paths where available
+  """
+  video_name = Path(video).stem
+  # create tempdir
+  tempdir = output_dir / video_name
+  tempdir.mkdir(parents=True, exist_ok=True)
+  # copy video with shutil.copy2
+  video_path = tempdir / Path(video).name
+  shutil.copy2(video, video_path)
+  yield {"message": f"Extracting audio from {video_name}", "video": video_path}
+  # extract the audio track into an mp3 next to the copied video
+  # NOTE(review): `ffmpeg` is not among the imports visible in this diff - confirm it is imported.
+  output_audio = tempdir / f"{video_name}.mp3"
+  ffmpeg.input(video_path).output(filename=output_audio).run()
+  yield {"message": f"Got audio from {video_name}", "video": video, "audio": output_audio}
   # Run whisper on the audio with language unless auto
   try:
+    audio_file = output_audio
+    print(f"Starting whisper transcribe with {output_audio}")
+    # honour the caller's source_language (its default keeps the previous 'Autodetect' behaviour)
+    transcribe_whisper_result = transcribe(audio_file, translate_action=False, language=source_language, override_model_size=model_size)
+    # the translation pass targets English; the detected language is only its source
+    yield {"message": f"Finished transcription ({transcribe_whisper_result['language']} detected), starting translation to English"}
     detected_language = LANGUAGES[transcribe_whisper_result["language"]]
+    translate_whisper_result = transcribe(audio_file, translate_action=True, language=detected_language, override_model_size=model_size)
+    yield {"message": f"Finished translation to English, preparing subtitle files"}
+    with open(tempdir / f"{video_name}.vtt", "w", encoding="utf-8") as vtt:
+      write_vtt(transcribe_whisper_result['segments'], file=vtt)
+    # yield {"message": f"Created VTT files", "vtt_path": f"{video_name}.vtt", "vtt_en_path": f"{video_name}.en.vtt"}
+    # write_srt(transcribe_whisper_result['segments'], tempdir / f"{video_name}.srt")
+    # write_srt(translate_whisper_result['segments'], tempdir / f"{video_name}_en.srt")
+    # yield {"message": f"Created SRT files", "srt_path": f"{video_name}.srt", "srt_en_path": f"{video_name}.en.srt"}
+    # print(f"Transcribe successful!")
   except Exception as e:
     print(f"Could not transcribe file: {e}")
+    # surface the failure in the UI too, the way download_generator yields its error messages
+    yield {"message": f"Could not transcribe file: {e}"}
     return
+def caption_generator(social_media_url,uid, language="Autodetect", model_size=model_size):
+  """
+  Download the audio of a video, run whisper in transcribe and translate mode and
+  package both results as WebVTT caption files.
+  :param social_media_url: URL the audio is downloaded from
+  :param uid: identifier used to name the downloaded audio and the returned caption files
+  :param language: String - language passed to the transcription pass, default is Autodetect
+  :param model_size: whisper model size, passed to transcribe as override_model_size
+  :return: ('success', captions) where captions is a list of {"language_tag", "vtt_file"} dicts
+           (original language first, English second); None when transcription fails
+  """
+  with tempfile.TemporaryDirectory() as tempdir:
+    tempdir = Path(tempdir)
+    # try:
+    #   print(f"Downloading {social_media_url} ")
+    #   meta = check_download(social_media_url)
+    #   print(f"Downloaded {meta['id']}.mp3 from {meta['uploader_id']} and url {meta['webpage_url']}")
+    # except Exception as e:
+    #   print(f"Could not download file: {e}")
+    #   raise
+    try:
+      print(f"Starting audio only download with URL {social_media_url}, this may take a while")
+      meta, audio = download_audio(social_media_url, tempdir, id=uid)
+      print(f"Downloaded video and extracted audio")
+    except Exception as e:
+      print(f"Could not download file: {e}")
+      raise
+    # Run whisper on the audio with language unless auto
+    try:
+      print(f"Starting whisper transcribe with {uid}.mp3")
+      transcribe_whisper_result = transcribe(audio, translate_action=False, language=language, override_model_size=model_size)
+      detected_language = LANGUAGES[transcribe_whisper_result["language"]]
+      translate_whisper_result = transcribe(audio, translate_action=True, language=detected_language, override_model_size=model_size)
+      print(f"Transcribe successful!, writing files")
+      vtt_path = tempdir / f"{transcribe_whisper_result['language']}.vtt"
+      en_vtt_path = tempdir / f"en.vtt"
+      with open(vtt_path.resolve(), "w", encoding="utf-8") as vtt:
+        write_vtt(transcribe_whisper_result["segments"], file=vtt)
+      # the English file must come from the translation pass, not the transcription pass
+      with open(en_vtt_path.resolve(), "w", encoding="utf-8") as en_vtt:
+        write_vtt(translate_whisper_result["segments"], file=en_vtt)
+    except Exception as e:
+      print(f"Could not transcribe file: {e}")
+      return
+    # The files are read into memory here, while the temporary directory still exists
+    whisper_result_captions =  [
+      {
+        "language_tag": transcribe_whisper_result["language"],
+        "vtt_file": anvil.BlobMedia(content_type="text/plain", content=vtt_path.read_bytes(), name=f"{uid}.{transcribe_whisper_result['language']}.vtt")
+      },
+      {
+        "language_tag": "en",
+        # read the English file (previously the original-language file was read twice)
+        "vtt_file": anvil.BlobMedia(content_type="text/plain", content=en_vtt_path.read_bytes(), name=f"{uid}.en.vtt")
+      }
+    ]
+    return 'success', whisper_result_captions
   # Run whisper with translation task enabled (and save to different srt file)
   # Call anvil background task with both files, and both the plain texts
     print(filename)
     yield f"Downloaded {filename}"
+def download(url, tempdir, format="bestvideo[ext=mp4]+bestaudio/best", verbose=False, keepVideo=True, filename="%(id)s.%(ext)s"):
   try:
     ydl_opts = {
       "format": format,
         'preferredquality': '192',
       }],
       "skip_download": False,
+      "outtmpl": f"{tempdir}/{filename}",
       "noplaylist": True,
       "verbose": verbose,
+      "quiet": False,
       "progress_hooks": [progress_hook],
     }
     else:
       return meta, None, str(audio.resolve())
+def download_audio(url, tempdir, format="bestaudio/best", verbose=False, id=None):
+  """
+  Download only the audio of `url` into `tempdir` and have yt-dlp convert it to mp3.
+  :param url: media URL handed to yt-dlp
+  :param tempdir: Path - directory that receives the output file
+  :param format: String - yt-dlp format selector
+  :param verbose: Bool - forwarded as yt-dlp's "verbose" option
+  :param id: base name (without extension) of the output file
+  :return: (meta, audio_path) - the info returned by extract_info and the resolved path of <id>.mp3 as a string
+  """
+  # post-processor that turns the download into an mp3 file
+  extract_mp3 = {
+    'key': 'FFmpegExtractAudio',
+    'preferredcodec': 'mp3',
+    'preferredquality': '192',
+  }
+  options = {
+    "format": format,
+    "keepvideo": False,
+    'postprocessors': [extract_mp3],
+    "skip_download": False,
+    "outtmpl": f"{tempdir}/{id}.%(ext)s",
+    "noplaylist": True,
+    "verbose": verbose,
+    "quiet": False,
+    "progress_hooks": [progress_hook],
+  }
+  # a DownloadError raised by yt-dlp propagates unchanged to the caller
+  meta = YoutubeDL(options).extract_info(url, download=True)
+  mp3_path = tempdir / f"{id}.mp3"
+  return meta, str(mp3_path.resolve())
 def check_download(url):
   ydl_opts = {
     return meta
 def transcribe(audio, translate_action=True, language='Autodetect', override_model_size=''):
+  """
+  Transcribe audio file with whisper
+  :param audio: - The audio file to transcribe
+  :param translate_action: Bool - Whether to translate to English or keep original language
+  :param language: String - The language to transcribe to, default is Autodetect
+  :param override_model_size: String - Model size to load instead of the default model_size; an empty string keeps the default
+  :return:
+  """
   task = "translate" if translate_action else "transcribe"
   model_size_to_load = override_model_size if override_model_size else model_size
   print(f'Starting {task} with whisper size {model_size_to_load} on {audio}')

requirements.txt CHANGED Viewed

@@ -1,8 +1,9 @@
-youtube-dl==2021.12.17
 whisper @ git+https://github.com/openai/whisper.git@main#egg=whisper==1.1.5
 anvil-uplink==0.4.0
 gradio==3.4.0
 python-dotenv==0.21.0
 aiohttp==3.8.3
 aiohttp-requests==0.1.3
-fsspec=2022.8.2

+youtube-dl==2021.12.17 #remove this, moved to yt-dlp
 whisper @ git+https://github.com/openai/whisper.git@main#egg=whisper==1.1.5
 anvil-uplink==0.4.0
 gradio==3.4.0
 python-dotenv==0.21.0
 aiohttp==3.8.3
 aiohttp-requests==0.1.3
+fsspec==2022.8.2
+yt-dlp==2022.10.4

utils/apis.py CHANGED Viewed

@@ -15,7 +15,7 @@ from download import download_generator, caption_generator
 dotenv.load_dotenv()
 @anvil.server.callable
 def call_gradio_api(api_name='test_api', data=()):
   port = os.environ.get('SERVER_PORT', 8111)
@@ -64,16 +64,16 @@ def test_api(url=''):
   return f"I've slept for 15 seconds and now I'm done. "
 #TODO: add telegram error handler here
-def caption(tweet_url="", language="Autodetect", override_model_size=""):
   """
   :param media_id: The twitter media ID object
   :param user_id_str: The twitter user ID string
   :param tweet_url: tweet URL can potentially not exist in the future, so we can upload on behalf of the user
   :return:
   """
-  response = caption_generator(tweet_url, language, override_model_size)
-  return json.dumps(response)
 def render_api_elements(url_input, download_status, output_text, sub_video, output_file):
   with gr.Group(elem_id='fake_ass_group') as api_buttons:
@@ -97,6 +97,7 @@ def render_api_elements(url_input, download_status, output_text, sub_video, outp
              fn=caption,
              inputs=[
                      gr.Text(label='tweet_url'),
                      gr.Text(label='language (optional)'),
                      gr.Dropdown(label='Model Size', choices=['base', 'tiny', 'small', 'medium', 'large']),
                      ],

 dotenv.load_dotenv()
+@anvil.server.background_task
 @anvil.server.callable
 def call_gradio_api(api_name='test_api', data=()):
   port = os.environ.get('SERVER_PORT', 8111)
   return f"I've slept for 15 seconds and now I'm done. "
 #TODO: add telegram error handler here
+def caption(downloadable_url="",uid="", language="Autodetect", override_model_size=""):
   """
+  Generate captions for a video and hand them to a background task for upload.
+  :param downloadable_url: URL the audio is downloaded from
+  :param uid: media identifier; names the caption files and is passed to the background task
+  :param language: String - language passed to whisper for transcription, default is Autodetect
+  :param override_model_size: String - whisper model size passed on to caption_generator
+  :return: dict with 'status' and 'message' keys
   """
+  result = caption_generator(downloadable_url, uid, language, override_model_size)
+  if result is None:
+    # caption_generator returns None when transcription fails; report it instead of failing on the unpack
+    return {'status': 'error', 'message': f'could not generate captions for {uid}'}
+  status, whisper_result_captions = result
+  anvil.server.launch_background_task('add_captions_to_video', uid, whisper_result_captions)
+  # f-string so that the uid is actually interpolated into the message
+  return {'status': status, 'message': f'started a background process to upload subtitles to {uid}' }
 def render_api_elements(url_input, download_status, output_text, sub_video, output_file):
   with gr.Group(elem_id='fake_ass_group') as api_buttons:
              fn=caption,
              inputs=[
                      gr.Text(label='tweet_url'),
+                     gr.Text(label='media_uid'),
                      gr.Text(label='language (optional)'),
                      gr.Dropdown(label='Model Size', choices=['base', 'tiny', 'small', 'medium', 'large']),
                      ],