import base64
import os
import time
from dataclasses import dataclass
from datetime import timedelta

import banana_dev as banana
import gradio as gr
from loguru import logger
from pydub import AudioSegment

api_key = os.environ["BANANA_API_KEY"]
model_key = os.environ["BANANA_MODEL_KEY"]
password = os.environ["PASSWORD"]

SECONDS_IN_HOUR = 3600
SECONDS_IN_MINUTE = 60
HOURS_IN_DAY = 24
MICROSECONDS_IN_MILLISECOND = 1000


def timedelta_to_srt_timestamp(timedelta_timestamp):
    r"""
    Convert a :py:class:`~datetime.timedelta` to an SRT timestamp.

    .. doctest::

        >>> import datetime
        >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
        >>> timedelta_to_srt_timestamp(delta)
        '01:23:04,000'

    :param datetime.timedelta timedelta_timestamp: A timedelta to convert to
                                                   an SRT timestamp
    :returns: The timestamp in SRT format
    :rtype: str
    """
    hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
    hrs += timedelta_timestamp.days * HOURS_IN_DAY
    mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE)
    msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND
    return "%02d:%02d:%02d,%03d" % (hrs, mins, secs, msecs)


def timedelta_to_otr_timestamp(timedelta_timestamp):
    # Compact form of the SRT timestamp: drop a leading "00:" hour block
    # and the trailing ",000" milliseconds.
    output = timedelta_to_srt_timestamp(timedelta_timestamp)
    if output.startswith("00:"):
        output = output[3:]
    return output[:-4]


@dataclass
class Segment:
    text: str
    start: float
    end: float

    @property
    def start_ts(self) -> str:
        return timedelta_to_otr_timestamp(timedelta(seconds=self.start))

    @property
    def end_ts(self) -> str:
        return timedelta_to_otr_timestamp(timedelta(seconds=self.end))

    def __str__(self):
        return f"{self.start_ts} {self.text}"

    def to_otr(self):
        # Start timestamp followed by the segment text.
        sep = " "
        return f"{self.start_ts}{sep}{self.text}"
def transcribe(audio=None, url=None):
    if audio:
        audio_b64 = base64.b64encode(audio.export().read()).decode("ascii")
        payload = {"audio_b64": audio_b64}
    else:
        payload = {"url": url}
    response = banana.run(api_key, model_key, payload)
    print(response)
    if "error" in response:
        raise gr.Error(response["error"])
    # TODO: not sure why the response dict contains multiple model outputs
    return response["modelOutputs"][0]


def run_demo(password, microphone, file_upload):
    if password not in [os.environ["PASSWORD"], os.environ["ROOT_PASSWORD"]]:
        raise gr.Error("Der Zugriffscode ist falsch.")
    if (microphone is not None) and (file_upload is not None):
        logger.warning(
            "Achtung: Sie haben sowohl eine Datei hochgeladen als auch über das Mikrofon aufgenommen."
            " Wir verwenden nur die Datei, die Sie hochgeladen haben."
        )
    elif (microphone is None) and (file_upload is None):
        raise gr.Error(
            "Sie müssen entweder eine Datei hochladen oder über das Mikrofon aufnehmen."
        )
    file = microphone if microphone is not None else file_upload

    start = time.time()
    # Non-root users are limited to the first 60 seconds of audio.
    cutoff = None if password == os.environ["ROOT_PASSWORD"] else 60_000
    transcription = transcribe(AudioSegment.from_file(file)[:cutoff])

    # Merge consecutive segments until one ends in sentence-final punctuation.
    segments = []
    for seg in transcription["segments"]:
        text = seg["text"].strip()
        if not segments or segments[-1].text[-1] in ".:?!":
            segments.append(Segment(text, seg["start"], seg["end"]))
        else:
            segments[-1].text += " " + text

    logger.info(f"transcription took {time.time() - start:.3f}s")
    return "\n\n".join(str(s) for s in segments)


demo = gr.Interface(
    fn=run_demo,
    inputs=[
        # gr.Textbox(label="Email", type="email"),
        gr.Textbox(label="Zugriffscode (siehe oben)"),
        gr.Audio(source="microphone", type="filepath", label="Aufnehmen"),
        gr.Audio(source="upload", type="filepath", label="Datei hochladen"),
    ],
    outputs=gr.Textbox(label="Automatisches Transkript"),
    allow_flagging="never",
    css="footer {visibility: hidden} .meta-text {visibility: hidden}",
)

demo.launch()