whisper-demo-es-medium

Build error

App Files Files Community

juancopi81 committed on Dec 21, 2022

Commit

6249bc9

•

1 Parent(s): 73a3627

Create YouTube Illustrated Summary

Browse files

Files changed (9) hide show

.gitignore +132 -0
README.md +4 -4
app.py +241 -87
requirements.txt +10 -2
textprocessor.py +56 -0
transcriber.py +12 -0
utils.py +6 -0
videocreator.py +46 -0
youtubeaudioextractor.py +25 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,132 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# vscode
+.vscode/

README.md CHANGED Viewed

@@ -1,15 +1,15 @@
 ---
-title: Whisper Demo
-emoji: 🇪🇸
 colorFrom: indigo
 colorTo: red
 sdk: gradio
-sdk_version: 3.9.1
 app_file: app.py
 pinned: false
 tags:
 - whisper-event
-duplicated_from: whisper-event/whisper-demo
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: YouTube to Illustrated Summary
+emoji: 🏢
 colorFrom: indigo
 colorTo: red
 sdk: gradio
+sdk_version: 3.14.0
 app_file: app.py
 pinned: false
+license: openrail
 tags:
 - whisper-event
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,97 +1,251 @@
-import torch
 import gradio as gr
-import pytube as pt
 from transformers import pipeline
-from huggingface_hub import model_info
-MODEL_NAME = "juancopi81/whisper-medium-es" #this always needs to stay in line 8 :D sorry for the hackiness
 lang = "es"
-device = 0 if torch.cuda.is_available() else "cpu"
-pipe = pipeline(
     task="automatic-speech-recognition",
-    model=MODEL_NAME,
     chunk_length_s=30,
     device=device,
 )
-pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
-def transcribe(microphone, file_upload):
-    warn_output = ""
-    if (microphone is not None) and (file_upload is not None):
-        warn_output = (
-            "WARNING: You've uploaded an audio file and used the microphone. "
-            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
-        )
-    elif (microphone is None) and (file_upload is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
-    file = microphone if microphone is not None else file_upload
-    text = pipe(file)["text"]
-    return warn_output + text
-def _return_yt_html_embed(yt_url):
-    video_id = yt_url.split("?v=")[-1]
-    HTML_str = (
-        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-        " </center>"
     )
-    return HTML_str
-def yt_transcribe(yt_url):
-    yt = pt.YouTube(yt_url)
-    html_embed_str = _return_yt_html_embed(yt_url)
-    stream = yt.streams.filter(only_audio=True)[0]
-    stream.download(filename="audio.mp3")
-    text = pipe("audio.mp3")["text"]
-    return html_embed_str, text
-demo = gr.Blocks()
-mf_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
-        gr.inputs.Audio(source="upload", type="filepath", optional=True),
-    ],
-    outputs="text",
-    layout="horizontal",
-    theme="huggingface",
-    title="Whisper Demo: Transcribe Audio",
-    description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-        " of arbitrary length."
-    ),
-    allow_flagging="never",
-)
-yt_transcribe = gr.Interface(
-    fn=yt_transcribe,
-    inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
-    outputs=["html", "text"],
-    layout="horizontal",
-    theme="huggingface",
-    title="Whisper Demo: Transcribe YouTube",
-    description=(
-        "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
-        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
-        " arbitrary length."
-    ),
-    allow_flagging="never",
-)
-with demo:
-    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
-demo.launch(enable_queue=True)

 import gradio as gr
+from typing import Any
+import torch
 from transformers import pipeline
+from diffusers import StableDiffusionPipeline
+from TTS.api import TTS
+import utils
+from youtubeaudioextractor import PytubeAudioExtractor
+from transcriber import Transcriber
+from textprocessor import TextProcessor
+from videocreator import VideoCreator
+# Model id passed to the speech-recognition pipeline created below.
+TRANSCRIBER_MODEL_NAME = "juancopi81/whisper-medium-es"
 lang = "es"
+# Use CUDA when it is available; half precision is only selected on CUDA.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.float16 if device == "cuda" else torch.float32
+# Detect if code is running in Colab
+is_colab = utils.is_google_colab()
+# HTML snippet with an "Open In Colab" badge; empty when already running in Colab.
+# NOTE(review): the link's href is empty and this variable is not referenced in the
+# visible part of this file - confirm whether it is still needed.
+colab_instruction = "" if is_colab else """
+<p>You can skip the queue using Colab:
+<a href="">
+<img data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" src="https://colab.research.google.com/assets/colab-badge.svg"></a></p>"""
+# Label interpolated into the page header to show which device is in use.
+device_print = "GPU 🔥" if torch.cuda.is_available() else "CPU 🥶"
+# Initialize components
+audio_extractor = PytubeAudioExtractor()
+transcription_pipe = pipeline(
     task="automatic-speech-recognition",
+    model=TRANSCRIBER_MODEL_NAME,
     chunk_length_s=30,
     device=device,
 )
+# Set the decoder prompt ids for the configured language and the "transcribe" task.
+transcription_pipe.model.config.forced_decoder_ids = transcription_pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+audio_transcriber = Transcriber(transcription_pipe)
+openai_model = "text-davinci-003"
+text_processor = TextProcessor(openai_model)
+image_model_id = "runwayml/stable-diffusion-v1-5"
+# NOTE(review): revision="fp16" is requested regardless of `dtype`, which is float32
+# when running on CPU - confirm this combination is intended.
+image_pipeline = StableDiffusionPipeline.from_pretrained(image_model_id,
+                                                         torch_dtype=dtype,
+                                                         revision="fp16")
+image_pipeline = image_pipeline.to(device)
+# NOTE(review): the voice model is picked by position (index 22) in TTS.list_models();
+# which model that is depends on the list's ordering - TODO confirm and consider pinning by name.
+vo_model_name = TTS.list_models()[22]
+# Init TTS
+tts = TTS(vo_model_name)
+video_creator = VideoCreator(tts, image_pipeline)
+def datapipeline(url: str) -> Any:
+    audio_path_file = audio_extractor.extract(url)
+    print(f"Audio file created at: {audio_path_file}")
+    transcribed_text = audio_transcriber.transcribe(audio_path_file)
+    print("Audio transcription ready!")
+    json_scenes = text_processor.get_json_scenes(transcribed_text)
+    print("Scenes ready")
+    video = video_creator.create_video(json_scenes)
+    return video, video
+css = """
+        a {
+            color: inherit;
+            text-decoration: underline;
+        }
+        .gradio-container {
+            font-family: 'IBM Plex Sans', sans-serif;
+        }
+        .gr-button {
+            color: white;
+            border-color: #000000;
+            background: #000000;
+        }
+        input[type='range'] {
+            accent-color: #000000;
+        }
+        .dark input[type='range'] {
+            accent-color: #dfdfdf;
+        }
+        .container {
+            max-width: 730px;
+            margin: auto;
+            padding-top: 1.5rem;
+        }
+        #gallery {
+            min-height: 22rem;
+            margin-bottom: 15px;
+            margin-left: auto;
+            margin-right: auto;
+            border-bottom-right-radius: .5rem !important;
+            border-bottom-left-radius: .5rem !important;
+        }
+        #gallery>div>.h-full {
+            min-height: 20rem;
+        }
+        .details:hover {
+            text-decoration: underline;
+        }
+        .gr-button {
+            white-space: nowrap;
+        }
+        .gr-button:focus {
+            border-color: rgb(147 197 253 / var(--tw-border-opacity));
+            outline: none;
+            box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
+            --tw-border-opacity: 1;
+            --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
+            --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
+            --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
+            --tw-ring-opacity: .5;
+        }
+        #advanced-btn {
+            font-size: .7rem !important;
+            line-height: 19px;
+            margin-top: 12px;
+            margin-bottom: 12px;
+            padding: 2px 8px;
+            border-radius: 14px !important;
+        }
+        #advanced-options {
+            margin-bottom: 20px;
+        }
+        .footer {
+            margin-bottom: 45px;
+            margin-top: 35px;
+            text-align: center;
+            border-bottom: 1px solid #e5e5e5;
+        }
+        .footer>p {
+            font-size: .8rem;
+            display: inline-block;
+            padding: 0 10px;
+            transform: translateY(10px);
+            background: white;
+        }
+        .dark .footer {
+            border-color: #303030;
+        }
+        .dark .footer>p {
+            background: #0b0f19;
+        }
+        .acknowledgments h4{
+            margin: 1.25em 0 .25em 0;
+            font-weight: bold;
+            font-size: 115%;
+        }
+        #container-advanced-btns{
+            display: flex;
+            flex-wrap: wrap;
+            justify-content: space-between;
+            align-items: center;
+        }
+        .animate-spin {
+            animation: spin 1s linear infinite;
+        }
+        @keyframes spin {
+            from {
+                transform: rotate(0deg);
+            }
+            to {
+                transform: rotate(360deg);
+            }
+        }
+        #share-btn-container {
+            display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
+        }
+        #share-btn {
+            all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
+        }
+        #share-btn * {
+            all: unset;
+        }
+        .gr-form{
+            flex: 1 1 50%; border-top-right-radius: 0; border-bottom-right-radius: 0;
+        }
+        #prompt-container{
+            gap: 0;
+        }
+        #generated_id{
+            min-height: 700px
+        }
+"""
+block = gr.Blocks(css=css)
+with block as demo:
+    gr.HTML(
+        f"""
+            <div style="text-align: center; max-width: 650px; margin: 0 auto;">
+              <div
+                style="
+                  display: inline-flex;
+                  align-items: center;
+                  gap: 0.8rem;
+                  font-size: 1.75rem;
+                "
+              >
+                <h1 style="font-weight: 900; margin-bottom: 7px;">
+                  YouTube to Illustraded Summary
+                </h1>
+              </div>
+              <p style="margin-bottom: 10px; font-size: 94%">
+                Enter the URL of a YouTuve video (Spanish) and you'll recive a video with an illustraded summary.
+                It works for audio books, history lessons, etc. Try it out with a short video (less than 10 minutes).
+              </p>
+              <p style="margin-bottom: 10px; font-size: 94%">
+                Running on <b>{device_print}</b>
+              </p>
+            </div>
+        """
     )
+    with gr.Group():
+        with gr.Box():
+            with gr.Row().style(mobile_collapse=False, equal_height=True):
+                url = gr.Textbox(
+                    label="Enter the URL of the YouTubeVideo", show_label=False, max_lines=1
+                ).style(
+                    border=(True, False, True, True),
+                    rounded=(True, False, False, True),
+                    container=False,
+                )
+                btn = gr.Button("Run").style(
+                    margin=False,
+                    rounded=(False, True, True, False),
+                )
+        video_output = gr.Video()
+        file_output = gr.File()
+        btn.click(datapipeline,
+                  inputs=[url],
+                  outputs=[video_output, file_output])
+    gr.HTML(
+            """
+                <div class="footer">
+                    <p>This demos is part of the Whisper Sprint (Dec. 2022).</a>
+                    </p>
+                </div>
+           """
+        )
+    gr.Markdown('''
+      [![Twitter Follow](https://img.shields.io/twitter/follow/juancopi81?style=social)](https://twitter.com/juancopi81)
+      ![visitors](https://visitor-badge.glitch.me/badge?page_id=Juancopi81.yt-illustraded-summary)
+    ''')
+if not is_colab:
+    demo.queue(concurrency_count=1)
+demo.launch(debug=is_colab, share=is_colab)

requirements.txt CHANGED Viewed

@@ -1,3 +1,11 @@
-git+https://github.com/huggingface/transformers
 torch
-pytube

 torch
+pytube
+git+https://github.com/huggingface/transformers
+openai
+typing
+git+https://github.com/huggingface/diffusers.git
+accelerate
+TTS
+moviepy
+imageio==2.4.1
+tensorboard

textprocessor.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import os
+import json
+from typing import Dict
+import openai
+# Prompt template for the completion model. The $TRANSCRIPTION placeholder is
+# replaced with the transcribed text in TextProcessor.get_json_scenes.
+context_prompt = """
+You are a creator of ilustraded books building a series of scenes for your book.
+Your boss asked you to write a summary and the illustrations of the following text:
+$TRANSCRIPTION
+You have to write the summary using a maximum of 7 scenes, and using the following JSON format:
+Write your answer in JSON format that has: The number of the scene, the summary for each scene, and the Illustration for each scene.
+The value for "Summary" should be in Spanish and it should not be longer than 30 words.
+The value for "Illustration" should be in English and no longer than 20 words. It should have a detail description of an illustration for this scene in English with many details, and a artistic style for the illustration that matches the text.
+Just answer with the JSON object, so your boss can easily parse it.
+"""
+# The API key is read from the SECRET_KEY_OPENAI environment variable at import
+# time; os.getenv returns None when the variable is not set.
+openai.api_key = os.getenv("SECRET_KEY_OPENAI")
+class TextProcessor:
+    def __init__(self,
+                 model: str = "text-davinci-003",
+                 temperature: float = 0.7,
+                 max_tokens: int = 2500,
+                 top_p: int = 1,
+                 frequency_penalty: int = 0,
+                 presence_penalty: int = 0) -> None:
+        self.model = model
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.top_p = top_p
+        self.frequency_penalty = frequency_penalty
+        self.presence_penalty = presence_penalty
+    def get_json_scenes(self,
+                        prompt: str) -> Dict:
+        gpt_prompt = context_prompt.replace("$TRANSCRIPTION", prompt)
+        response = openai.Completion.create(
+            model=self.model,
+            prompt=gpt_prompt,
+            temperature=self.temperature,
+            max_tokens=self.max_tokens,
+            top_p=self.top_p,
+            frequency_penalty=self.frequency_penalty,
+            presence_penalty=self.presence_penalty
+        )
+        scenes = json.loads(response["choices"][0]["text"])
+        if (type(scenes) == list):
+            scenes = {i: d for i, d in enumerate(scenes)}
+        return scenes

transcriber.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from transformers import pipeline
+class Transcriber:
+    def __init__(self, pipe: pipeline) -> None:
+        self.pipe = pipe
+    def transcribe(self, file_path: str = "yt_audio.mp3") -> str:
+        try:
+            transcription = self.pipe(file_path)["text"]
+            return transcription
+        except:
+            return "ERROR: No audio file found to transcribe"

utils.py ADDED Viewed

	@@ -0,0 +1,6 @@

+def is_google_colab():
+    try:
+        import google.colab
+        return True
+    except:
+        return False

videocreator.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from typing import Dict
+from moviepy.editor import VideoFileClip, concatenate_videoclips
+class VideoCreator:
+    def __init__(self,
+                 tts_pipeline,
+                 image_pipeline) -> None:
+        self.tts_pipeline = tts_pipeline
+        self.image_pipeline = image_pipeline
+    def create_video(self, scenes: Dict) -> Dict:
+        videos_dict = {}
+        for index, scene in enumerate(scenes):
+            video_scene = self._create_video_from_scene(scenes[scene])
+            videos_dict[scene] = video_scene
+        merged_video = self._merge_videos(videos_dict)
+        return merged_video
+    def _create_video_from_scene(self, scene: Dict) -> str:
+        audio_file = self._get_audio_from_text(scene["Summary"])
+        bg_image = self._get_bg_image_from_description(scene["Illustration"])
+        video = gr.make_waveform(audio=audio_file,
+                                 bg_image=bg_image)
+        return video
+    def _get_audio_from_text(self, voice_over: str) -> str:
+        self.tts_pipeline.tts_to_file(text=voice_over,
+                                      file_path="output.wav")
+        return "output.wav"
+    def _get_bg_image_from_description(self, img_desc: str):
+        images = self.image_pipeline(img_desc)
+        print("Image generated!")
+        image_output = images.images[0]
+        image_output.save("img.png")
+        return "img.png"
+    def _merge_videos(self, videos_dict: Dict):
+        videos_to_concatenate = []
+        for video in range(len(videos_dict)):
+            video_clip = VideoFileClip(videos_dict[video])
+            videos_to_concatenate.append(video_clip)
+        final_video = concatenate_videoclips(videos_to_concatenate)
+        final_video.write_videofile("final_video.mp4")

youtubeaudioextractor.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from abc import ABC, abstractmethod
+import pytube as pt
+class YouTubeAudioExtractor(ABC):
+    @abstractmethod
+    def extract(self, url: str, save_path: str) -> str:
+        pass
+class PytubeAudioExtractor(YouTubeAudioExtractor):
+    def __init__(self,
+                 only_audio: bool = True,
+                 extension: str = ".mp3") -> None:
+        self.only_audio = only_audio
+        self.extension = extension
+    def extract(self, url: str,
+                save_path: str = "yt_audio") -> str:
+        yt = pt.YouTube(url)
+        stream = yt.streams.filter(only_audio=self.only_audio)[0]
+        filename = save_path + self.extension
+        stream.download(filename=filename)
+        return filename