Benjamin Bossan committed
Commit 2cbbc23
1 Parent(s): 308c6f6

Add youtube transcription processor

Dockerfile CHANGED
@@ -1,6 +1,6 @@
 FROM pytorch/pytorch:latest
 
-RUN apt update && apt install -y && rm -rf /var/lib/apt/lists/*
+RUN apt update && apt install -y ffmpeg && rm -rf /var/lib/apt/lists/*
 
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
demo.py CHANGED
@@ -6,6 +6,8 @@ client = httpx.Client()
 
 
 def submit(inputs):
+    if not inputs:
+        return
     payload = {"content": inputs, "author": "anna nymous"}
     httpx.post("http://localhost:8080/submit/", json=payload)
 
pyproject.toml CHANGED
@@ -16,7 +16,8 @@ addopts = "--cov=src --cov-report=term-missing"
 [tool.mypy]
 no_implicit_optional = true
 strict = true
+plugins = "numpy.typing.mypy_plugin"
 
 [[tool.mypy.overrides]]
-module = "huggingface_hub,trafilatura,transformers.*"
+module = "huggingface_hub,trafilatura,transformers.*,pytube"
 ignore_missing_imports = true
requirements-dev.txt CHANGED
@@ -5,3 +5,4 @@ ruff
 pytest
 pytest-cov
 types-Pillow
+types-urllib3
requirements.txt CHANGED
@@ -8,3 +8,5 @@ charset-normalizer
 trafilatura
 pillow
 gradio
+urllib3
+pytube
src/gistillery/config.py CHANGED
@@ -8,6 +8,8 @@ class Config(BaseSettings):
     hf_hub_token: str = "missing"
     hf_agent: str = "https://api-inference.huggingface.co/models/bigcode/starcoder"
     db_file_name: Path = Path("sqlite-data.db")
+    sampling_rate: int = 16_000  # audio transcription
+    max_yt_length: int = 1800  # in seconds (30 minutes)
 
     class Config:
         # load .env file by default, with provisio to use other .env files if set
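The two new settings feed the transcription pipeline added later in this commit: sampling_rate is the rate the Whisper feature extractor expects, and max_yt_length caps how long a video may be before download is refused. A minimal usage sketch, assuming get_config() (imported in preprocessing.py below) returns a populated Config instance:

    # Sketch only; get_config() is assumed to return a Config instance.
    from gistillery.config import get_config

    config = get_config()
    config.sampling_rate   # 16_000 Hz, the rate Whisper models expect
    config.max_yt_length   # 1800 seconds, i.e. at most 30 minutes of video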
src/gistillery/media.py ADDED
@@ -0,0 +1,68 @@
+import subprocess
+import tempfile
+
+import numpy as np
+import numpy.typing as npt
+import pytube
+
+
+def download_yt_audio(url: str, max_length: int) -> str:
+    yt = pytube.YouTube(url)
+    if (max_length is not None) and (yt.length > max_length):
+        raise ValueError(f"Youtube video exceeds max length of {max_length}")
+
+    video = yt.streams.filter(only_audio=True).first()
+    tmp_path = tempfile.mkdtemp()
+    fname = video.download(output_path=tmp_path)
+    assert isinstance(fname, str)
+    return fname
+
+
+def check_ffmpeg_installed() -> None:
+    cmd = ["ffmpeg", "-version"]  # sic
+    try:
+        subprocess.run(cmd, check=True)
+    except FileNotFoundError as exc:
+        raise RuntimeError("This feature requires ffmpeg to be installed") from exc
+
+
+# from openai whisper
+def load_audio(file: str, sampling_rate: int) -> npt.NDArray[np.float32]:
+    """Open an audio file and read as mono waveform, resampling as necessary
+
+    Parameters
+    ----------
+    file: str
+        The audio file to open
+
+    sampling_rate: int
+        The sample rate to resample the audio if necessary
+
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype.
+
+    """
+    check_ffmpeg_installed()  # BB
+
+    # This launches a subprocess to decode audio while down-mixing
+    # and resampling as necessary. Requires the ffmpeg CLI in PATH.
+    # fmt: off
+    cmd = [
+        "ffmpeg",
+        "-nostdin",
+        "-threads", "0",
+        "-i", file,
+        "-f", "s16le",
+        "-ac", "1",
+        "-acodec", "pcm_s16le",
+        "-ar", str(sampling_rate),
+        "-"
+    ]
+    # fmt: on
+    try:
+        out = subprocess.run(cmd, capture_output=True, check=True).stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
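Taken together, the two helpers form a small pipeline: pytube fetches the audio-only stream, then ffmpeg decodes and resamples it into a mono float32 waveform scaled to [-1, 1]. A hedged usage sketch (the URL is a placeholder and ffmpeg must be on PATH):

    from gistillery.media import download_yt_audio, load_audio

    # placeholder URL; max_length is compared against pytube's video length in seconds
    fname = download_yt_audio("https://www.youtube.com/watch?v=<video-id>", max_length=1800)
    waveform = load_audio(fname, sampling_rate=16_000)
    print(waveform.dtype, waveform.shape)  # float32, 16_000 samples per second of audio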
src/gistillery/preprocessing.py CHANGED
@@ -4,15 +4,18 @@ import logging
 import re
 from typing import Optional
 
+import torch
 import trafilatura
+import urllib3
 from httpx import Client
-
 from PIL import Image
+from transformers import AutoProcessor, WhisperForConditionalGeneration
 
 from gistillery.base import JobInput
+from gistillery.config import get_config
+from gistillery.media import download_yt_audio, load_audio
 from gistillery.tools import get_agent
 
-
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
@@ -109,3 +112,73 @@ class ImageUrlProcessor(Processor):
         image = Image.open(io.BytesIO(response.content)).convert('RGB')
         caption = get_agent().run("Caption the following image", image=image)
         return str(caption)
+
+
+class YoutubeUrlProcessor(Processor):
+    """Download yt audio, transcribe with whisper"""
+
+    def __init__(self) -> None:
+        self.client = Client()
+        self.url: Optional[str] = None
+        self.template = "{url}\n\n{content}"
+
+        self.processor = AutoProcessor.from_pretrained("openai/whisper-small.en")
+        self.model = WhisperForConditionalGeneration.from_pretrained(
+            "openai/whisper-small.en"
+        )
+
+        self.hosts = {"www.youtube.com", "youtube.com", "youtu.be"}
+
+    def match(self, input: JobInput) -> bool:
+        url = get_url(input.content.strip())
+        if url is None:
+            return False
+
+        parsed = urllib3.util.parse_url(url)
+        if parsed.host not in self.hosts:
+            return False
+
+        self.url = url
+        return True
+
+    @staticmethod
+    def make_batch(input_ids: torch.Tensor, max_len: int) -> torch.Tensor:
+        """Create batches from last dimension, pad last batch if necessary
+
+        Examples
+        >>> import torch
+        >>> x = torch.zeros((1, 10, 213))
+        >>> YoutubeUrlProcessor.make_batch(x, max_len=100).shape
+        torch.Size([3, 10, 100])
+
+        """
+        # ugly workaround, transformers whisper implementation requires a
+        # specific shape of input length, probably there is a better way...
+        batches = input_ids.split(max_len, dim=-1)  # type: ignore
+        last = batches[-1]
+        n = last.shape[-1]
+        last = torch.nn.functional.pad(last, (1, max_len - n - 1), value=0.0)
+        batches = batches[:-1] + (last,)
+        return torch.concat(batches)
+
+    def process(self, input: JobInput) -> str:
+        if not isinstance(self.url, str):
+            raise TypeError("self.url must be a string")
+
+        config = get_config()
+        fname = download_yt_audio(self.url, max_length=config.max_yt_length)
+        audio = load_audio(fname, sampling_rate=config.sampling_rate)
+        inputs = self.processor(
+            audio,
+            return_tensors='pt',
+            sampling_rate=config.sampling_rate,
+            max_length=-1,
+        )
+        batch = self.make_batch(
+            inputs['input_features'], max_len=2 * self.model.config.max_source_positions
+        )
+        generated_ids = self.model.generate(batch)
+        transcription = self.processor.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )
+        return self.template.format(url=self.url, content=" ".join(transcription))
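process() follows the standard transformers Whisper flow with one twist: features are extracted without truncation (max_length=-1) and then chopped by make_batch into chunks of 2 * max_source_positions mel frames (3000 frames, roughly 30 seconds for whisper-small.en), since the model only accepts that fixed input length. A condensed, illustrative sketch of the same calls outside the class, using the default 30-second featurization instead of the chunking workaround:

    import numpy as np
    from transformers import AutoProcessor, WhisperForConditionalGeneration

    processor = AutoProcessor.from_pretrained("openai/whisper-small.en")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small.en")

    audio = np.zeros(16_000 * 10, dtype=np.float32)  # stand-in for load_audio(...) output
    inputs = processor(audio, return_tensors="pt", sampling_rate=16_000)
    generated_ids = model.generate(inputs["input_features"])
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)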
src/gistillery/registry.py CHANGED
@@ -1,11 +1,12 @@
 from gistillery.base import JobInput
-from gistillery.tools import Summarizer, Tagger, HfDefaultSummarizer, HfDefaultTagger
 from gistillery.preprocessing import (
+    DefaultUrlProcessor,
+    ImageUrlProcessor,
     Processor,
     RawTextProcessor,
-    ImageUrlProcessor,
-    DefaultUrlProcessor,
+    YoutubeUrlProcessor,
 )
+from gistillery.tools import HfDefaultSummarizer, HfDefaultTagger, Summarizer, Tagger
 
 
 class ToolRegistry:
@@ -57,6 +58,7 @@ def get_tool_registry() -> ToolRegistry:
     tagger = HfDefaultTagger()
 
     _registry = ToolRegistry()
+    _registry.register_processor(YoutubeUrlProcessor())
     _registry.register_processor(ImageUrlProcessor())
     _registry.register_processor(DefaultUrlProcessor())
     _registry.register_processor(RawTextProcessor())
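YoutubeUrlProcessor is registered ahead of ImageUrlProcessor and DefaultUrlProcessor. Assuming the registry hands each input to the first registered processor whose match() returns True (the ToolRegistry internals are not part of this diff), that ordering is what keeps a YouTube URL from being claimed by the generic URL processor. Roughly:

    # presumed dispatch logic, not shown in this commit
    def dispatch(processors, job_input):
        for processor in processors:      # registration order matters
            if processor.match(job_input):
                return processor.process(job_input)
        raise RuntimeError("no processor matched")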