xangcastle committed on
Commit
e3eb20a
·
1 Parent(s): d8cd210

adding translation

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +119 -13
  3. requirements.txt +36 -2
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ local
2
+ .idea
app.py CHANGED
@@ -1,23 +1,129 @@
 
 
 
1
  import gradio as gr
2
  import whisper
3
  import numpy as np
4
  import pytube as pt
 
 
 
 
5
 
6
- model = whisper.load_model("medium")
 
 
 
 
 
 
 
7
  print(
8
- f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
9
- f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
10
  )
11
 
12
- options = dict(language='es', beam_size=5, best_of=5)
13
- transcribe_options = dict(task="transcribe", **options)
14
- translate_options = dict(task="translate", **options)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  def transcribe(audio):
18
- transcription = model.transcribe(audio, **transcribe_options)
19
- translation = model.transcribe(audio, **translate_options)
20
- return transcription["text"], translation["text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
 
23
  def youtube_transcribe(url):
@@ -36,16 +142,16 @@ mic_interface = gr.Interface(
36
  )
37
 
38
  audio_interface = gr.Interface(
39
- fn=transcribe,
40
  inputs=gr.Audio(type="filepath"),
41
- outputs=["text", "text"],
42
  title="Transcribir y traducir audio",
43
  )
44
 
45
  video_interface = gr.Interface(
46
- fn=transcribe,
47
  inputs="video",
48
- outputs=["text", "text"],
49
  title="Transcribir y traducir audio",
50
  )
51
 
 
1
+ import os.path
2
+ import pathlib
3
+ from io import StringIO
4
  import gradio as gr
5
  import whisper
6
  import numpy as np
7
  import pytube as pt
8
+ import ffmpeg
9
+ import textwrap
10
+ import cv2
11
+ from transformers import pipeline
12
 
13
# Filesystem layout: generated artifacts are written to <app dir>/local/output.
APP_DIR = pathlib.Path(__file__).parent.absolute()
LOCAL_DIR = APP_DIR / "local"
save_dir = LOCAL_DIR / "output"

# mkdir() is called without parents=True, so the parent is created first.
LOCAL_DIR.mkdir(exist_ok=True)
save_dir.mkdir(exist_ok=True)

# Whisper speech model, loaded once at import time.
transcriber = whisper.load_model("base")
print(
    f"Model is {'multilingual' if transcriber.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in transcriber.parameters()):,} parameters."
)

# Keyword arguments forwarded to transcriber.transcribe().
options = dict(language='en', beam_size=5, best_of=5)
transcribe_options = dict(task="translate", **options)

# English -> Spanish text translation pipeline.
translator = pipeline("translation_en_to_es", model="Helsinki-NLP/opus-mt-en-es")
30
+
31
+
32
def translate(text):
    """Translate ``text`` with the module-level ``translator`` pipeline.

    Returns the ``"translation_text"`` entry of the first result the
    pipeline produces for ``text``.
    """
    first_result = translator(text)[0]
    return first_result["translation_text"]
34
+
35
+
36
def format_timestamp(seconds, always_include_hours=False, fractional_seperator='.'):
    """Render a non-negative duration in seconds as ``[HH:]MM:SS<sep>mmm``.

    Args:
        seconds: Duration in seconds; must be >= 0.
        always_include_hours: When true, emit the ``HH:`` field even if it is zero.
        fractional_seperator: String placed between the seconds and milliseconds.

    Returns:
        The formatted timestamp; the value is rounded to the nearest millisecond.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    total_ms = round(seconds * 1000.0)

    # Peel off hours, minutes and seconds; what is left is the millisecond part.
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1_000)

    if always_include_hours or hours > 0:
        hours_prefix = f"{hours:02d}:"
    else:
        hours_prefix = ""
    return f"{hours_prefix}{minutes:02d}:{whole_seconds:02d}{fractional_seperator}{millis:03d}"
51
+
52
+
53
def process_text(text: str, max_line_width=None):
    """Wrap ``text`` to at most ``max_line_width`` characters per line.

    Args:
        text: The text to wrap.
        max_line_width: Maximum line width. ``None``, zero or a negative
            value disables wrapping.

    Returns:
        ``text`` unchanged when wrapping is disabled, otherwise the wrapped
        lines joined with ``"\\n"``.
    """
    # textwrap.wrap() raises ValueError for width <= 0, so zero must be
    # treated as "no wrapping" just like negative values.
    if max_line_width is None or max_line_width <= 0:
        return text

    return '\n'.join(textwrap.wrap(text, width=max_line_width, tabsize=4))
59
+
60
+
61
def write_srt(transcript, file, max_line_width=None):
    """Write ``transcript`` to ``file`` as numbered SRT cues.

    Args:
        transcript: Iterable of segments exposing ``'start'``, ``'end'`` and
            ``'text'`` keys.
        file: Writable text stream; it is flushed after every cue.
        max_line_width: Forwarded to ``process_text`` to wrap the cue text.
    """
    for index, segment in enumerate(transcript, start=1):
        wrapped = process_text(segment['text'].strip(), max_line_width)
        # "-->" separates the two timestamps on the timing line, so it is
        # rewritten when it occurs inside the cue text.
        cue_text = wrapped.replace('-->', '->')

        start = format_timestamp(segment['start'], always_include_hours=True, fractional_seperator=',')
        end = format_timestamp(segment['end'], always_include_hours=True, fractional_seperator=',')

        # print() appends one more newline, giving the blank line between cues.
        cue = f"{index}\n{start} --> {end}\n{cue_text}\n"
        print(cue, file=file, flush=True)
74
+
75
+
76
def get_subs(segments, max_line_width):
    """Translate every segment's text and return the segments as SRT text.

    Note: each segment's ``'text'`` entry is overwritten in place with the
    output of ``translate``.

    Args:
        segments: Segments exposing ``'start'``, ``'end'`` and ``'text'`` keys.
        max_line_width: Forwarded to ``write_srt`` for line wrapping.

    Returns:
        The complete SRT document as a string.
    """
    for segment in segments:
        segment['text'] = translate(segment["text"])

    buffer = StringIO()
    write_srt(segments, file=buffer, max_line_width=max_line_width)
    return buffer.getvalue()
83
+
84
+
85
def generate_subtitled_video(video, audio, transcript):
    """Render ``video`` with subtitles from ``transcript`` and the given audio.

    Args:
        video: Path of the input video.
        audio: Path of the audio track to pair with the video.
        transcript: Path of the subtitle file handed to the ``subtitles`` filter.

    Returns:
        Path of the rendered file, ``<save_dir>/final.mp4`` (overwritten on
        every call).
    """
    output_path = f"{save_dir}/final.mp4"

    video_stream = ffmpeg.input(video)
    audio_stream = ffmpeg.input(audio)
    subtitled = video_stream.filter("subtitles", transcript)

    # One video stream and one audio stream go into the output.
    joined = ffmpeg.concat(subtitled, audio_stream, v=1, a=1)
    joined.output(output_path).run(quiet=True, overwrite_output=True)
    return output_path
92
+
93
+
94
def generate_subtitled_audio(audio, transcript):
    """Build a video from ``audio``: a still cover image with subtitles burned in.

    Args:
        audio: Path of the input audio file.
        transcript: Path of the subtitle file passed to ffmpeg's
            ``subtitles`` filter.

    Returns:
        Path of the rendered file, ``<save_dir>/final.mp4`` (overwritten on
        every call). ffmpeg's exit status is not checked.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    import subprocess  # function-scope import keeps this block self-contained

    cover_path = f'{save_dir}/cover.jpg'
    output_path = f"{save_dir}/final.mp4"

    # Create the all-zero (black) 640x320 cover once and reuse it afterwards.
    if not os.path.exists(cover_path):
        cover = np.zeros([320, 640, 3], dtype=np.uint8)
        cv2.imwrite(cover_path, cover)

    # An argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed to ffmpeg verbatim and are never
    # interpreted by a shell.
    command = [
        'ffmpeg', '-y',
        '-loop', '1', '-i', cover_path,
        '-i', str(audio),
        '-c:v', 'libx264', '-tune', 'stillimage',
        '-c:a', 'aac', '-b:a', '192k',
        '-pix_fmt', 'yuv420p',
        '-shortest',
        '-vf', f'subtitles={transcript}',
        output_path,
    ]
    # check=False mirrors the previous os.system() call, which ignored failures.
    subprocess.run(command, check=False)
    return output_path
103
 
104
 
105
def transcribe(audio):
    """Transcribe ``audio`` and write translated subtitles to disk.

    Runs ``transcriber.transcribe`` with ``transcribe_options``, turns the
    resulting segments into SRT text via ``get_subs`` (wrapped at 80
    characters) and saves it to ``<save_dir>/transcript.srt``.

    Args:
        audio: Audio input accepted by ``transcriber.transcribe``.

    Returns:
        A ``(text, srt)`` tuple: the transcription's ``"text"`` entry and the
        SRT document that was written to disk.
    """
    transcription = transcriber.transcribe(audio, **transcribe_options)
    srt = get_subs(transcription["segments"], 80)

    # srt is one string, so a single write() is the right call; the with
    # block closes the file on exit.
    with open(f"{save_dir}/transcript.srt", "w", encoding='utf8') as f:
        f.write(srt)
    return transcription["text"], srt
112
+
113
+
114
def transcribe_audio(audio):
    """Transcribe an audio file and render it as a subtitled video.

    Args:
        audio: Path of the audio file.

    Returns:
        A ``(video_path, transcription, translation)`` tuple, where the last
        two items are the values returned by ``transcribe``.
    """
    transcription, translation = transcribe(audio)
    # transcribe() has just written the subtitle file used here.
    subtitle_path = f"{save_dir}/transcript.srt"
    rendered = generate_subtitled_audio(audio, subtitle_path)
    return rendered, transcription, translation
117
+
118
+
119
def transcribe_video(video):
    """Transcribe a video's audio track and burn the subtitles into the video.

    Args:
        video: Path of the input video.

    Returns:
        A ``(video_path, transcription, translation)`` tuple, where the last
        two items are the values returned by ``transcribe``.
    """
    wav_path = f"{save_dir}/output.wav"

    # Extract the audio as single-channel 16 kHz pcm_s16le WAV.
    source = ffmpeg.input(video)
    extraction = ffmpeg.output(source, wav_path, acodec="pcm_s16le", ac=1, ar="16k")
    ffmpeg.run(extraction, overwrite_output=True)

    samples = whisper.load_audio(wav_path)
    transcription, translation = transcribe(samples)

    # transcribe() has just written the subtitle file used here.
    rendered = generate_subtitled_video(video, wav_path, f"{save_dir}/transcript.srt")
    return rendered, transcription, translation
127
 
128
 
129
  def youtube_transcribe(url):
 
142
  )
143
 
144
  audio_interface = gr.Interface(
145
+ fn=transcribe_audio,
146
  inputs=gr.Audio(type="filepath"),
147
+ outputs=["video", "text", "text"],
148
  title="Transcribir y traducir audio",
149
  )
150
 
151
  video_interface = gr.Interface(
152
+ fn=transcribe_video,
153
  inputs="video",
154
+ outputs=["video", "text", "text"],
155
  title="Transcribir y traducir audio",
156
  )
157
 
requirements.txt CHANGED
@@ -1,16 +1,23 @@
1
  aiohttp==3.8.3
2
  aiosignal==1.3.1
 
3
  anyio==3.6.2
4
  async-timeout==4.0.2
5
  attrs==22.1.0
 
6
  bcrypt==4.0.1
 
 
7
  certifi==2022.9.24
8
  cffi==1.15.1
9
  charset-normalizer==2.1.1
10
  click==8.1.3
 
11
  contourpy==1.0.6
12
  cryptography==38.0.4
13
  cycler==0.11.0
 
 
14
  fastapi==0.88.0
15
  ffmpeg-python==0.2.0
16
  ffmpy==0.3.0
@@ -19,13 +26,19 @@ fonttools==4.38.0
19
  frozenlist==1.3.3
20
  fsspec==2022.11.0
21
  future==0.18.2
 
 
22
  gradio==3.12.0
23
  h11==0.12.0
24
  httpcore==0.15.0
25
  httpx==0.23.1
26
  huggingface-hub==0.11.1
27
  idna==3.4
 
 
28
  Jinja2==3.1.2
 
 
29
  kiwisolver==1.4.4
30
  linkify-it-py==1.0.3
31
  markdown-it-py==2.1.0
@@ -36,39 +49,60 @@ mdurl==0.1.2
36
  more-itertools==9.0.0
37
  multidict==6.0.3
38
  numpy==1.23.5
 
39
  orjson==3.8.3
40
  packaging==21.3
41
  pandas==1.5.2
42
  paramiko==2.12.0
43
  Pillow==9.3.0
 
 
 
44
  pycparser==2.21
45
  pycryptodome==3.16.0
46
  pydantic==1.10.2
 
47
  pydub==0.25.1
 
 
48
  PyNaCl==1.5.0
49
  pyparsing==3.0.9
 
50
  python-dateutil==2.8.2
51
  python-multipart==0.0.5
 
52
  pytz==2022.6
 
53
  PyYAML==6.0
54
  regex==2022.10.31
55
  requests==2.28.1
56
  rfc3986==1.5.0
 
 
57
  semantic-version==2.10.0
 
 
58
  setuptools-rust==1.5.2
59
  six==1.16.0
 
60
  sniffio==1.3.0
61
  starlette==0.22.0
 
62
  tokenizers==0.13.2
 
 
63
  torch==1.13.0
 
64
  tqdm==4.64.1
65
  transformers==4.25.1
66
  typing_extensions==4.4.0
 
 
67
  uc-micro-py==1.0.1
68
  urllib3==1.26.13
69
  uvicorn==0.20.0
 
70
  websockets==10.4
71
  whisper @ git+https://github.com/openai/whisper.git@fd8f80c8b880dd7c284c109ca7f03dbe978bc532
72
  yarl==1.8.2
73
-
74
- pytube~=12.1.0
 
1
  aiohttp==3.8.3
2
  aiosignal==1.3.1
3
+ altair==4.2.0
4
  anyio==3.6.2
5
  async-timeout==4.0.2
6
  attrs==22.1.0
7
+ backports.zoneinfo==0.2.1
8
  bcrypt==4.0.1
9
+ blinker==1.5
10
+ cachetools==5.2.0
11
  certifi==2022.9.24
12
  cffi==1.15.1
13
  charset-normalizer==2.1.1
14
  click==8.1.3
15
+ commonmark==0.9.1
16
  contourpy==1.0.6
17
  cryptography==38.0.4
18
  cycler==0.11.0
19
+ decorator==5.1.1
20
+ entrypoints==0.4
21
  fastapi==0.88.0
22
  ffmpeg-python==0.2.0
23
  ffmpy==0.3.0
 
26
  frozenlist==1.3.3
27
  fsspec==2022.11.0
28
  future==0.18.2
29
+ gitdb==4.0.10
30
+ GitPython==3.1.29
31
  gradio==3.12.0
32
  h11==0.12.0
33
  httpcore==0.15.0
34
  httpx==0.23.1
35
  huggingface-hub==0.11.1
36
  idna==3.4
37
+ importlib-metadata==5.1.0
38
+ importlib-resources==5.10.1
39
  Jinja2==3.1.2
40
+ joblib==1.2.0
41
+ jsonschema==4.17.3
42
  kiwisolver==1.4.4
43
  linkify-it-py==1.0.3
44
  markdown-it-py==2.1.0
 
49
  more-itertools==9.0.0
50
  multidict==6.0.3
51
  numpy==1.23.5
52
+ opencv-python==4.6.0.66
53
  orjson==3.8.3
54
  packaging==21.3
55
  pandas==1.5.2
56
  paramiko==2.12.0
57
  Pillow==9.3.0
58
+ pkgutil_resolve_name==1.3.10
59
+ protobuf==3.20.3
60
+ pyarrow==10.0.1
61
  pycparser==2.21
62
  pycryptodome==3.16.0
63
  pydantic==1.10.2
64
+ pydeck==0.8.0
65
  pydub==0.25.1
66
+ Pygments==2.13.0
67
+ Pympler==1.0.1
68
  PyNaCl==1.5.0
69
  pyparsing==3.0.9
70
+ pyrsistent==0.19.2
71
  python-dateutil==2.8.2
72
  python-multipart==0.0.5
73
+ pytube==12.1.0
74
  pytz==2022.6
75
+ pytz-deprecation-shim==0.1.0.post0
76
  PyYAML==6.0
77
  regex==2022.10.31
78
  requests==2.28.1
79
  rfc3986==1.5.0
80
+ rich==12.6.0
81
+ sacremoses==0.0.53
82
  semantic-version==2.10.0
83
+ semver==2.13.0
84
+ sentencepiece==0.1.97
85
  setuptools-rust==1.5.2
86
  six==1.16.0
87
+ smmap==5.0.0
88
  sniffio==1.3.0
89
  starlette==0.22.0
90
+ streamlit==1.15.2
91
  tokenizers==0.13.2
92
+ toml==0.10.2
93
+ toolz==0.12.0
94
  torch==1.13.0
95
+ tornado==6.2
96
  tqdm==4.64.1
97
  transformers==4.25.1
98
  typing_extensions==4.4.0
99
+ tzdata==2022.7
100
+ tzlocal==4.2
101
  uc-micro-py==1.0.1
102
  urllib3==1.26.13
103
  uvicorn==0.20.0
104
+ validators==0.20.0
105
  websockets==10.4
106
  whisper @ git+https://github.com/openai/whisper.git@fd8f80c8b880dd7c284c109ca7f03dbe978bc532
107
  yarl==1.8.2
108
+ zipp==3.11.0