Spaces:
Build error
Build error
Commit
·
e3eb20a
1
Parent(s):
d8cd210
adding translation
Browse files- .gitignore +2 -0
- app.py +119 -13
- requirements.txt +36 -2
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
local
|
| 2 |
+
.idea
|
app.py
CHANGED
|
@@ -1,23 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import whisper
|
| 3 |
import numpy as np
|
| 4 |
import pytube as pt
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
print(
|
| 8 |
-
f"Model is {'multilingual' if
|
| 9 |
-
f"and has {sum(np.prod(p.shape) for p in
|
| 10 |
)
|
| 11 |
|
| 12 |
-
options = dict(language='
|
| 13 |
-
transcribe_options = dict(task="
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
def transcribe(audio):
|
| 18 |
-
transcription =
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
def youtube_transcribe(url):
|
|
@@ -36,16 +142,16 @@ mic_interface = gr.Interface(
|
|
| 36 |
)
|
| 37 |
|
| 38 |
audio_interface = gr.Interface(
|
| 39 |
-
fn=
|
| 40 |
inputs=gr.Audio(type="filepath"),
|
| 41 |
-
outputs=["text", "text"],
|
| 42 |
title="Transcribir y traducir audio",
|
| 43 |
)
|
| 44 |
|
| 45 |
video_interface = gr.Interface(
|
| 46 |
-
fn=
|
| 47 |
inputs="video",
|
| 48 |
-
outputs=["text", "text"],
|
| 49 |
title="Transcribir y traducir audio",
|
| 50 |
)
|
| 51 |
|
|
|
|
| 1 |
+
import os.path
|
| 2 |
+
import pathlib
|
| 3 |
+
from io import StringIO
|
| 4 |
import gradio as gr
|
| 5 |
import whisper
|
| 6 |
import numpy as np
|
| 7 |
import pytube as pt
|
| 8 |
+
import ffmpeg
|
| 9 |
+
import textwrap
|
| 10 |
+
import cv2
|
| 11 |
+
from transformers import pipeline
|
| 12 |
|
| 13 |
+
# Directory containing this script; every generated artefact lives beneath it.
APP_DIR = pathlib.Path(__file__).parent.absolute()

# "local" scratch area and the "output" folder inside it.  The parent has to
# exist before the child can be created, hence the order of the mkdir calls.
LOCAL_DIR = APP_DIR / "local"
save_dir = LOCAL_DIR / "output"
LOCAL_DIR.mkdir(exist_ok=True)
save_dir.mkdir(exist_ok=True)

# Whisper "base" checkpoint, loaded once when the module is imported.
transcriber = whisper.load_model("base")
print(
    f"Model is {'multilingual' if transcriber.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in transcriber.parameters()):,} parameters."
)

# Decoding settings; `transcribe_options` adds the "translate" task on top of
# the shared options and is what `transcribe` passes to the model.
options = {"language": "en", "beam_size": 5, "best_of": 5}
transcribe_options = {"task": "translate", **options}

# English -> Spanish text translation pipeline used by `translate`.
translator = pipeline("translation_en_to_es", model="Helsinki-NLP/opus-mt-en-es")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def translate(text):
    """Translate ``text`` with the module-level ``translator`` pipeline.

    The pipeline result is indexed as a sequence of mappings; the
    ``"translation_text"`` entry of its first element is returned.
    """
    first_result = translator(text)[0]
    return first_result["translation_text"]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def format_timestamp(seconds, always_include_hours=False, fractional_seperator='.'):
    """Format a duration given in seconds as ``[HH:]MM:SS<sep>mmm``.

    Args:
        seconds: Non-negative duration in seconds (int or float).  It is
            rounded to the nearest millisecond before formatting.
        always_include_hours: When true, the ``HH:`` prefix is emitted even
            if the hour component is zero; otherwise it only appears for
            durations of one hour or more.
        fractional_seperator: String placed between the seconds and the
            three-digit millisecond field (SRT uses ``','``).

    Returns:
        The formatted timestamp string.

    Raises:
        ValueError: If ``seconds`` is negative (or not comparable as >= 0).
    """
    # An explicit raise instead of `assert`: assertions are stripped when
    # Python runs with -O, which would let negative values through silently.
    if not seconds >= 0:
        raise ValueError("non-negative timestamp expected")

    remaining_ms = round(seconds * 1000.0)
    hours, remaining_ms = divmod(remaining_ms, 3_600_000)
    minutes, remaining_ms = divmod(remaining_ms, 60_000)
    whole_seconds, milliseconds = divmod(remaining_ms, 1_000)

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{whole_seconds:02d}{fractional_seperator}{milliseconds:03d}"
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def process_text(text: str, max_line_width=None):
    """Wrap ``text`` so that no line exceeds ``max_line_width`` characters.

    Args:
        text: The text to wrap.
        max_line_width: Maximum line length.  ``None``, zero or a negative
            value disables wrapping and ``text`` is returned unchanged.

    Returns:
        The wrapped lines joined with ``'\\n'``, or ``text`` itself when
        wrapping is disabled.
    """
    # `textwrap.wrap` rejects width 0 with ValueError, so zero is treated the
    # same way as the other "no wrapping" values instead of crashing.
    if max_line_width is None or max_line_width <= 0:
        return text

    return '\n'.join(textwrap.wrap(text, width=max_line_width, tabsize=4))
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def write_srt(transcript, file, max_line_width=None):
    """Write ``transcript`` to ``file`` as numbered SRT cues.

    Each segment is read through its ``'text'``, ``'start'`` and ``'end'``
    keys.  Cues are numbered from 1, timestamps always carry the hour field
    and use ``','`` before the milliseconds, and the stream is flushed after
    every cue.

    Args:
        transcript: Iterable of segment mappings.
        file: Writable text stream that receives the cues.
        max_line_width: Forwarded to ``process_text`` to wrap the cue text.
    """
    for cue_number, segment in enumerate(transcript, start=1):
        wrapped = process_text(segment['text'].strip(), max_line_width)
        # "-->" separates the two timestamps of a cue, so it must not appear
        # inside the cue text itself.
        cue_text = wrapped.replace('-->', '->')

        begins = format_timestamp(segment['start'], always_include_hours=True, fractional_seperator=',')
        ends = format_timestamp(segment['end'], always_include_hours=True, fractional_seperator=',')

        print(f"{cue_number}\n{begins} --> {ends}\n{cue_text}\n", file=file, flush=True)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def get_subs(segments, max_line_width):
    """Translate every segment's text and return the result as SRT text.

    Args:
        segments: Iterable of segment mappings with at least the ``'text'``,
            ``'start'`` and ``'end'`` keys that ``write_srt`` reads.
        max_line_width: Maximum cue line width, forwarded to ``write_srt``.

    Returns:
        The complete SRT document as a single string.
    """
    # Work on shallow copies: overwriting the caller's segments in place
    # would make a second call translate text that is already translated.
    translated_segments = [
        {**segment, "text": translate(segment["text"])} for segment in segments
    ]

    buffer = StringIO()
    write_srt(translated_segments, file=buffer, max_line_width=max_line_width)
    return buffer.getvalue()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def generate_subtitled_video(video, audio, transcript):
    """Render ``video`` with ``transcript`` as subtitles and ``audio`` as sound.

    The video input is passed through ffmpeg's ``subtitles`` filter, joined
    with the audio input (one video stream, one audio stream) and written to
    ``final.mp4`` inside ``save_dir``, overwriting any existing file.

    Returns:
        The path of the rendered file as a string.
    """
    destination = f"{save_dir}/final.mp4"

    picture = ffmpeg.input(video)
    sound = ffmpeg.input(audio)
    subtitled_picture = picture.filter("subtitles", transcript)

    combined = ffmpeg.concat(subtitled_picture, sound, v=1, a=1)
    combined.output(destination).run(quiet=True, overwrite_output=True)
    return destination
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def generate_subtitled_audio(audio, transcript):
    """Render ``audio`` as a video showing ``transcript`` over a black frame.

    A 640x320 black ``cover.jpg`` is created in ``save_dir`` on first use and
    looped as the picture track; ffmpeg then muxes it with ``audio`` and
    applies the ``subtitles`` filter, writing ``final.mp4`` in ``save_dir``.

    Args:
        audio: Path of the audio file to use as the sound track.
        transcript: Path of the subtitle file to display.

    Returns:
        The path of the rendered file as a string.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import subprocess

    cover_path = f'{save_dir}/cover.jpg'
    output_path = f"{save_dir}/final.mp4"

    if not os.path.exists(cover_path):
        cover = np.zeros([320, 640, 3], dtype=np.uint8)
        cv2.imwrite(cover_path, cover)

    # Arguments are passed as a list (no shell), so paths containing spaces
    # or shell metacharacters can neither break nor inject into the command.
    command = [
        'ffmpeg', '-y', '-loop', '1', '-i', cover_path,
        '-i', str(audio), '-c:v', 'libx264',
        '-tune', 'stillimage', '-c:a', 'aac', '-b:a', '192k',
        '-pix_fmt', 'yuv420p', '-shortest',
        '-vf', f'subtitles={transcript}', output_path,
    ]
    # check=True: without it a failed run would go unnoticed and a missing or
    # stale final.mp4 would be returned to the caller.
    subprocess.run(command, check=True)
    return output_path
|
| 103 |
|
| 104 |
|
| 105 |
def transcribe(audio):
    """Run the Whisper model on ``audio`` and save translated subtitles.

    The model is invoked with ``transcribe_options``; its segments are turned
    into SRT text by ``get_subs`` (wrapped at 80 characters) and written to
    ``transcript.srt`` inside ``save_dir``.

    Args:
        audio: Audio input accepted by ``transcriber.transcribe``.

    Returns:
        A ``(text, srt)`` tuple: the model's ``"text"`` output and the SRT
        document that was written to disk.
    """
    transcription = transcriber.transcribe(audio, **transcribe_options)
    srt = get_subs(transcription["segments"], 80)

    # `srt` is one string, so a single write() is the right call; the `with`
    # block closes the file, no explicit close() is needed.
    with open(f"{save_dir}/transcript.srt", "w", encoding='utf8') as f:
        f.write(srt)

    return transcription["text"], srt
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def transcribe_audio(audio):
    """Transcribe an audio file and render it as a subtitled video.

    Returns:
        A ``(video_path, transcription, translation)`` tuple, where the last
        two items are exactly what ``transcribe`` returned.
    """
    text, subtitles = transcribe(audio)

    # `transcribe` has just written the subtitle file at this location.
    subtitle_path = f"{save_dir}/transcript.srt"
    rendered = generate_subtitled_audio(audio, subtitle_path)
    return rendered, text, subtitles
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def transcribe_video(video):
    """Transcribe a video's sound track and burn the subtitles into it.

    The audio is first extracted to ``output.wav`` in ``save_dir`` (codec
    ``pcm_s16le``, one channel, ``16k`` sample rate), loaded with
    ``whisper.load_audio`` and handed to ``transcribe``.

    Returns:
        A ``(video_path, transcription, translation)`` tuple, where the last
        two items are exactly what ``transcribe`` returned.
    """
    wav_path = f"{save_dir}/output.wav"
    subtitle_path = f"{save_dir}/transcript.srt"

    extraction = ffmpeg.output(
        ffmpeg.input(video), wav_path, acodec="pcm_s16le", ac=1, ar="16k"
    )
    ffmpeg.run(extraction, overwrite_output=True)

    samples = whisper.load_audio(wav_path)
    text, subtitles = transcribe(samples)

    rendered = generate_subtitled_video(video, wav_path, subtitle_path)
    return rendered, text, subtitles
|
| 127 |
|
| 128 |
|
| 129 |
def youtube_transcribe(url):
|
|
|
|
| 142 |
)
|
| 143 |
|
| 144 |
# Gradio interface for uploaded audio: takes a file path and shows the
# rendered video plus the two text outputs returned by `transcribe_audio`.
audio_interface = gr.Interface(
    title="Transcribir y traducir audio",
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=["video", "text", "text"],
)
|
| 150 |
|
| 151 |
# Gradio interface for uploaded video: shows the subtitled video plus the two
# text outputs returned by `transcribe_video`.
# NOTE(review): the title still says "audio" although this tab takes video —
# possibly copied from `audio_interface`; confirm the intended wording.
video_interface = gr.Interface(
    title="Transcribir y traducir audio",
    fn=transcribe_video,
    inputs="video",
    outputs=["video", "text", "text"],
)
|
| 157 |
|
requirements.txt
CHANGED
|
@@ -1,16 +1,23 @@
|
|
| 1 |
aiohttp==3.8.3
|
| 2 |
aiosignal==1.3.1
|
|
|
|
| 3 |
anyio==3.6.2
|
| 4 |
async-timeout==4.0.2
|
| 5 |
attrs==22.1.0
|
|
|
|
| 6 |
bcrypt==4.0.1
|
|
|
|
|
|
|
| 7 |
certifi==2022.9.24
|
| 8 |
cffi==1.15.1
|
| 9 |
charset-normalizer==2.1.1
|
| 10 |
click==8.1.3
|
|
|
|
| 11 |
contourpy==1.0.6
|
| 12 |
cryptography==38.0.4
|
| 13 |
cycler==0.11.0
|
|
|
|
|
|
|
| 14 |
fastapi==0.88.0
|
| 15 |
ffmpeg-python==0.2.0
|
| 16 |
ffmpy==0.3.0
|
|
@@ -19,13 +26,19 @@ fonttools==4.38.0
|
|
| 19 |
frozenlist==1.3.3
|
| 20 |
fsspec==2022.11.0
|
| 21 |
future==0.18.2
|
|
|
|
|
|
|
| 22 |
gradio==3.12.0
|
| 23 |
h11==0.12.0
|
| 24 |
httpcore==0.15.0
|
| 25 |
httpx==0.23.1
|
| 26 |
huggingface-hub==0.11.1
|
| 27 |
idna==3.4
|
|
|
|
|
|
|
| 28 |
Jinja2==3.1.2
|
|
|
|
|
|
|
| 29 |
kiwisolver==1.4.4
|
| 30 |
linkify-it-py==1.0.3
|
| 31 |
markdown-it-py==2.1.0
|
|
@@ -36,39 +49,60 @@ mdurl==0.1.2
|
|
| 36 |
more-itertools==9.0.0
|
| 37 |
multidict==6.0.3
|
| 38 |
numpy==1.23.5
|
|
|
|
| 39 |
orjson==3.8.3
|
| 40 |
packaging==21.3
|
| 41 |
pandas==1.5.2
|
| 42 |
paramiko==2.12.0
|
| 43 |
Pillow==9.3.0
|
|
|
|
|
|
|
|
|
|
| 44 |
pycparser==2.21
|
| 45 |
pycryptodome==3.16.0
|
| 46 |
pydantic==1.10.2
|
|
|
|
| 47 |
pydub==0.25.1
|
|
|
|
|
|
|
| 48 |
PyNaCl==1.5.0
|
| 49 |
pyparsing==3.0.9
|
|
|
|
| 50 |
python-dateutil==2.8.2
|
| 51 |
python-multipart==0.0.5
|
|
|
|
| 52 |
pytz==2022.6
|
|
|
|
| 53 |
PyYAML==6.0
|
| 54 |
regex==2022.10.31
|
| 55 |
requests==2.28.1
|
| 56 |
rfc3986==1.5.0
|
|
|
|
|
|
|
| 57 |
semantic-version==2.10.0
|
|
|
|
|
|
|
| 58 |
setuptools-rust==1.5.2
|
| 59 |
six==1.16.0
|
|
|
|
| 60 |
sniffio==1.3.0
|
| 61 |
starlette==0.22.0
|
|
|
|
| 62 |
tokenizers==0.13.2
|
|
|
|
|
|
|
| 63 |
torch==1.13.0
|
|
|
|
| 64 |
tqdm==4.64.1
|
| 65 |
transformers==4.25.1
|
| 66 |
typing_extensions==4.4.0
|
|
|
|
|
|
|
| 67 |
uc-micro-py==1.0.1
|
| 68 |
urllib3==1.26.13
|
| 69 |
uvicorn==0.20.0
|
|
|
|
| 70 |
websockets==10.4
|
| 71 |
whisper @ git+https://github.com/openai/whisper.git@fd8f80c8b880dd7c284c109ca7f03dbe978bc532
|
| 72 |
yarl==1.8.2
|
| 73 |
-
|
| 74 |
-
pytube~=12.1.0
|
|
|
|
| 1 |
aiohttp==3.8.3
|
| 2 |
aiosignal==1.3.1
|
| 3 |
+
altair==4.2.0
|
| 4 |
anyio==3.6.2
|
| 5 |
async-timeout==4.0.2
|
| 6 |
attrs==22.1.0
|
| 7 |
+
backports.zoneinfo==0.2.1
|
| 8 |
bcrypt==4.0.1
|
| 9 |
+
blinker==1.5
|
| 10 |
+
cachetools==5.2.0
|
| 11 |
certifi==2022.9.24
|
| 12 |
cffi==1.15.1
|
| 13 |
charset-normalizer==2.1.1
|
| 14 |
click==8.1.3
|
| 15 |
+
commonmark==0.9.1
|
| 16 |
contourpy==1.0.6
|
| 17 |
cryptography==38.0.4
|
| 18 |
cycler==0.11.0
|
| 19 |
+
decorator==5.1.1
|
| 20 |
+
entrypoints==0.4
|
| 21 |
fastapi==0.88.0
|
| 22 |
ffmpeg-python==0.2.0
|
| 23 |
ffmpy==0.3.0
|
|
|
|
| 26 |
frozenlist==1.3.3
|
| 27 |
fsspec==2022.11.0
|
| 28 |
future==0.18.2
|
| 29 |
+
gitdb==4.0.10
|
| 30 |
+
GitPython==3.1.29
|
| 31 |
gradio==3.12.0
|
| 32 |
h11==0.12.0
|
| 33 |
httpcore==0.15.0
|
| 34 |
httpx==0.23.1
|
| 35 |
huggingface-hub==0.11.1
|
| 36 |
idna==3.4
|
| 37 |
+
importlib-metadata==5.1.0
|
| 38 |
+
importlib-resources==5.10.1
|
| 39 |
Jinja2==3.1.2
|
| 40 |
+
joblib==1.2.0
|
| 41 |
+
jsonschema==4.17.3
|
| 42 |
kiwisolver==1.4.4
|
| 43 |
linkify-it-py==1.0.3
|
| 44 |
markdown-it-py==2.1.0
|
|
|
|
| 49 |
more-itertools==9.0.0
|
| 50 |
multidict==6.0.3
|
| 51 |
numpy==1.23.5
|
| 52 |
+
opencv-python==4.6.0.66
|
| 53 |
orjson==3.8.3
|
| 54 |
packaging==21.3
|
| 55 |
pandas==1.5.2
|
| 56 |
paramiko==2.12.0
|
| 57 |
Pillow==9.3.0
|
| 58 |
+
pkgutil_resolve_name==1.3.10
|
| 59 |
+
protobuf==3.20.3
|
| 60 |
+
pyarrow==10.0.1
|
| 61 |
pycparser==2.21
|
| 62 |
pycryptodome==3.16.0
|
| 63 |
pydantic==1.10.2
|
| 64 |
+
pydeck==0.8.0
|
| 65 |
pydub==0.25.1
|
| 66 |
+
Pygments==2.13.0
|
| 67 |
+
Pympler==1.0.1
|
| 68 |
PyNaCl==1.5.0
|
| 69 |
pyparsing==3.0.9
|
| 70 |
+
pyrsistent==0.19.2
|
| 71 |
python-dateutil==2.8.2
|
| 72 |
python-multipart==0.0.5
|
| 73 |
+
pytube==12.1.0
|
| 74 |
pytz==2022.6
|
| 75 |
+
pytz-deprecation-shim==0.1.0.post0
|
| 76 |
PyYAML==6.0
|
| 77 |
regex==2022.10.31
|
| 78 |
requests==2.28.1
|
| 79 |
rfc3986==1.5.0
|
| 80 |
+
rich==12.6.0
|
| 81 |
+
sacremoses==0.0.53
|
| 82 |
semantic-version==2.10.0
|
| 83 |
+
semver==2.13.0
|
| 84 |
+
sentencepiece==0.1.97
|
| 85 |
setuptools-rust==1.5.2
|
| 86 |
six==1.16.0
|
| 87 |
+
smmap==5.0.0
|
| 88 |
sniffio==1.3.0
|
| 89 |
starlette==0.22.0
|
| 90 |
+
streamlit==1.15.2
|
| 91 |
tokenizers==0.13.2
|
| 92 |
+
toml==0.10.2
|
| 93 |
+
toolz==0.12.0
|
| 94 |
torch==1.13.0
|
| 95 |
+
tornado==6.2
|
| 96 |
tqdm==4.64.1
|
| 97 |
transformers==4.25.1
|
| 98 |
typing_extensions==4.4.0
|
| 99 |
+
tzdata==2022.7
|
| 100 |
+
tzlocal==4.2
|
| 101 |
uc-micro-py==1.0.1
|
| 102 |
urllib3==1.26.13
|
| 103 |
uvicorn==0.20.0
|
| 104 |
+
validators==0.20.0
|
| 105 |
websockets==10.4
|
| 106 |
whisper @ git+https://github.com/openai/whisper.git@fd8f80c8b880dd7c284c109ca7f03dbe978bc532
|
| 107 |
yarl==1.8.2
|
| 108 |
+
zipp==3.11.0
|
|
|