test-rtechs committed
Commit: 9670139
Parent(s): 42f33fc
Upload 38 files
- .devcontainer/devcontainer.json +6 -0
- .gitattributes +2 -0
- README.md +364 -16
- SoniTranslate_Colab.ipynb +46 -45
- SoniTranslate_Colab_embedded.ipynb +0 -0
- assets/Video.mp4 +3 -0
- assets/video_dub.mp4 +3 -0
- docs/windows_install.md +2 -2
- requirements.txt +23 -5
- requirements_base.txt +15 -0
- requirements_extra.txt +19 -0
- soni_translate/languages_gui.py +0 -0
- soni_translate/mdx_net.py +0 -12
- soni_translate/preprocessor.py +1 -1
- soni_translate/speech_segmentation.py +39 -91
- soni_translate/text_multiformat_processor.py +3 -1
- soni_translate/text_to_speech.py +11 -10
- soni_translate/utils.py +27 -4
- voice_main.py +9 -3
.devcontainer/devcontainer.json
ADDED
@@ -0,0 +1,6 @@
+{
+    "image": "mcr.microsoft.com/devcontainers/universal:2",
+    "features": {"ghcr.io/devcontainers-contrib/features/ffmpeg-apt-get:1": {},
+        "ghcr.io/devcontainers-contrib/features/yt-dlp:2": {}
+    }
+}
.gitattributes
CHANGED
@@ -1,2 +1,4 @@
 assets/Video_subtitled.mp4 filter=lfs diff=lfs merge=lfs -text
 assets/Video_main.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video_dub.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/Video.mp4 filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,16 +1,364 @@
The previous README consisted of 16 blank lines, all removed. The new content, entirely added, follows:
# 🎥 SoniTranslate 🈷️

🎬 Video Translation with Synchronized Audio 🌐

SoniTranslate is a powerful and user-friendly web application that allows you to easily translate videos into different languages. This repository hosts the code for the SoniTranslate web UI, which is built with the Gradio library to provide a seamless and interactive user experience.

| Description | Link |
| ----------- | ---- |
| 📙 Colab Notebook | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb) |
| 🎉 Repository | [![GitHub Repository](https://img.shields.io/badge/GitHub-Repository-black?style=flat-square&logo=github)](https://github.com/R3gm/SoniTranslate/) |
| 🚀 Online DEMO | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content) |

## SoniTranslate's web UI: a browser interface built on the Gradio library
![image](https://github.com/R3gm/SoniTranslate/assets/114810545/0d71fbf4-e9f0-4f8f-944e-8f3f1ea6a019)

## Using the project: a video guide

For a comprehensive understanding of the project, we highly recommend watching this video tutorial by [DEV-MalletteS](https://github.com/DEV-MalletteS). You can watch it on YouTube by clicking the thumbnail below:

[![Watch the video](https://img.youtube.com/vi/SmGkFaSzq_Q/0.jpg)](https://www.youtube.com/watch?v=SmGkFaSzq_Q)

## Supported languages for translation

| Language Code | Language |
|---------------|------------|
| en | English |
| fr | French |
| de | German |
| es | Spanish |
| it | Italian |
| ja | Japanese |
| nl | Dutch |
| uk | Ukrainian |
| pt | Portuguese |
| ar | Arabic |
| zh | Chinese - Simplified |
| zh-TW | Chinese - Traditional |
| cs | Czech |
| da | Danish |
| fi | Finnish |
| el | Greek |
| he | Hebrew |
| hu | Hungarian |
| ko | Korean |
| fa | Persian |
| pl | Polish |
| ru | Russian |
| tr | Turkish |
| ur | Urdu |
| hi | Hindi |
| vi | Vietnamese |
| id | Indonesian |
| bn | Bengali |
| te | Telugu |
| mr | Marathi |
| ta | Tamil |
| jw (or jv) | Javanese |
| ca | Catalan |
| ne | Nepali |
| th | Thai |
| sv | Swedish |
| am | Amharic |
| cy | Welsh |
| hr | Croatian |
| is | Icelandic |
| ka | Georgian |
| km | Khmer |
| sk | Slovak |
| sq | Albanian |
| sr | Serbian |
| az | Azerbaijani |
| bg | Bulgarian |
| gl | Galician |
| gu | Gujarati |
| kk | Kazakh |
| kn | Kannada |
| lt | Lithuanian |
| lv | Latvian |
| ml | Malayalam |
| ro | Romanian |
| si | Sinhala |
| su | Sundanese |
| et | Estonian |
| mk | Macedonian |
| sw | Swahili |
| af | Afrikaans |
| bs | Bosnian |
| la | Latin |
| my | Myanmar Burmese |
| no | Norwegian |
| as | Assamese |
| eu | Basque |
| ha | Hausa |
| ht | Haitian Creole |
| hy | Armenian |
| lo | Lao |
| mg | Malagasy |
| mn | Mongolian |
| mt | Maltese |
| pa | Punjabi |
| ps | Pashto |
| sl | Slovenian |
| sn | Shona |
| so | Somali |
| tg | Tajik |
| tk | Turkmen |
| tt | Tatar |
| uz | Uzbek |
| yo | Yoruba |

### Non-transcription

| Language Code | Language |
|---------------|------------|
| ay | Aymara |
| bm | Bambara |
| ceb | Cebuano |
| ny | Chichewa |
| dv | Divehi |
| doi | Dogri |
| ee | Ewe |
| gn | Guarani |
| ilo | Iloko |
| rw | Kinyarwanda |
| kri | Krio |
| ku | Kurdish |
| ky | Kirghiz |
| lg | Ganda |
| mai | Maithili |
| or | Oriya |
| om | Oromo |
| qu | Quechua |
| sm | Samoan |
| ti | Tigrinya |
| ts | Tsonga |
| ak | Akan |
| ug | Uighur |

## Example

### Original audio

https://github.com/R3gm/SoniTranslate/assets/114810545/db9e78c0-b228-4e81-9704-e62d5cc407a3

### Translated audio

https://github.com/R3gm/SoniTranslate/assets/114810545/6a8ddc65-a46f-4653-9726-6df2615f0ef9

## Colab Runtime

To run SoniTranslate using Colab Runtime: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)

## Install Locally (installation tested on Linux)

### Before You Start

Before you start installing and using SoniTranslate, there are a few things you need to do:

1. Install the NVIDIA drivers for CUDA 11.8.0. NVIDIA CUDA is a parallel computing platform and programming model that enables developers to use the power of NVIDIA graphics processing units (GPUs) to speed up compute-intensive tasks. You can find the drivers [here](https://developer.nvidia.com/cuda-toolkit-archive). Follow the instructions on the website to download and install the drivers.
2. Accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation
3. Create a [huggingface token](https://huggingface.co/settings/tokens). Hugging Face is a natural language processing platform that provides access to state-of-the-art models and tools. You will need to create a token in order to use some of the automatic model download features in SoniTranslate. Follow the instructions on the Hugging Face website to create a token. When you are creating the new Access Token in Hugging Face, make sure to tick "Read access to contents of all public gated repos you can access".
4. Install [Anaconda](https://www.anaconda.com/) or [Miniconda](https://docs.anaconda.com/free/miniconda/miniconda-install/). Anaconda is a free and open-source distribution of Python and R. It includes a package manager called conda that makes it easy to install and manage Python environments and packages. Follow the instructions on the Anaconda website to download and install Anaconda on your system.
5. Install Git for your system. Git is a version control system that helps you track changes to your code and collaborate with other developers. You can install Git with Anaconda by running `conda install -c anaconda git -y` in your terminal (do this after step 1 in the following section). If you have trouble installing Git via Anaconda, you can use the following link instead:
   - [Git for Linux](https://git-scm.com/download/linux)

Once you have completed these steps, you will be ready to install SoniTranslate.

### Getting Started

To install SoniTranslate, follow these steps:

1. Create a suitable anaconda environment for SoniTranslate and activate it:

```
conda create -n sonitr python=3.10 -y
conda activate sonitr
python -m pip install pip==23.1.2
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
```

2. Clone this github repository and navigate to it:

```
git clone https://github.com/r3gm/SoniTranslate.git
cd SoniTranslate
```

3. Install required packages:

```
pip install -r requirements_base.txt -v
pip install -r requirements_extra.txt -v
pip install onnxruntime-gpu
```

4. Install [ffmpeg](https://ffmpeg.org/download.html). FFmpeg is a free software project that produces libraries and programs for handling multimedia data. You will need it to process audio and video files. You can install ffmpeg with Anaconda by running `conda install -y ffmpeg` in your terminal (recommended). If you have trouble installing ffmpeg via Anaconda, you can use the following link instead: https://ffmpeg.org/ffmpeg.html. Once it is installed, make sure it is in your PATH by running `ffmpeg -h` in your terminal. If you don't get an error message, you're good to go.

5. Optional install:

After installing FFmpeg, you can install these optional packages.

[Piper TTS](https://github.com/rhasspy/piper) is a fast, local neural text-to-speech system that sounds great and is optimized for the Raspberry Pi 4. Piper is used in a variety of projects. Voices are trained with VITS and exported to onnxruntime.

```
pip install -q piper-tts==1.2.0
```

[Coqui XTTS](https://github.com/coqui-ai/TTS) is a text-to-speech (TTS) model that lets you generate realistic voices in different languages. It can clone voices with just a short audio clip and even speak in a different language! It's like having a personal voice mimic for any text you need spoken.

```
pip install -q -r requirements_xtts.txt
pip install -q TTS==0.21.1 --no-deps
```

### Running SoniTranslate

To run SoniTranslate locally, make sure the `sonitr` conda environment is active:

```
conda activate sonitr
```

Set your Hugging Face token as an environment variable in Linux:

```
export YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN"
```

Then navigate to the `SoniTranslate` folder and run the `app_rvc.py` script:

```
python app_rvc.py
```

When the local URL `http://127.0.0.1:7860` is displayed in the terminal, simply open it in your web browser to access the SoniTranslate interface.

### Stop and close SoniTranslate

In most environments, you can stop the execution by pressing Ctrl+C in the terminal where you launched the `app_rvc.py` script. This will interrupt the program and stop the Gradio app.
To deactivate the conda environment, use the following command:

```
conda deactivate
```

This deactivates the currently active `sonitr` conda environment, returning you to the base environment or the global Python environment.

### Starting Over

If you need to start over from scratch, you can delete the `SoniTranslate` folder and remove the `sonitr` conda environment with the following commands:

```
conda deactivate
conda env remove -n sonitr
```

With the `sonitr` environment removed, you can start over with a fresh installation.

### Notes

- Alternatively, you can set your Hugging Face token as a permanent environment variable with:

```
conda activate sonitr
conda env config vars set YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN_HERE"
conda deactivate
```

- To use OpenAI's GPT API for translation, TTS, or transcription, set your OpenAI API key as an environment variable (in quotes):

```
conda activate sonitr
conda env config vars set OPENAI_API_KEY="your-api-key-here"
conda deactivate
```

## Command line arguments

The `app_rvc.py` script supports command-line arguments to customize its behavior. Here's a brief guide on how to use them:

| Argument command | Default | Value | Description |
|------------------|---------|-------|-------------|
| --theme | Taithrah/Minimal | String | Sets the theme for the interface. Themes can be found in the [Theme Gallery](https://huggingface.co/spaces/gradio/theme-gallery). |
| --language | english | String | Selects the interface language. Available options: afrikaans, arabic, azerbaijani, chinese_zh_cn, english, french, german, hindi, indonesian, italian, japanese, korean, marathi, persian, polish, portuguese, russian, spanish, swedish, turkish, ukrainian, vietnamese. |
| --verbosity_level | info | String | Sets the verbosity level of the logger: debug, info, warning, error, or critical. |
| --public_url | | Boolean | Enables a public link. |
| --cpu_mode | | Boolean | Enables CPU mode to run the program without GPU acceleration. |
| --logs_in_gui | | Boolean | Shows the operations performed in Logs (obsolete). |

Example usage:
```
python app_rvc.py --theme aliabid94/new-theme --language french
```
This command sets the theme to a custom theme and selects French as the interface language.
Feel free to customize these arguments according to your preferences and requirements.

## 📖 News

🔥 2024/05/18: New update details:
- Added an Overlap Reduction option
- OpenAI API key integration for transcription, translation, and TTS
- More output types: subtitles by speaker, separate audio sound, and video only with subtitles
- Access to better-performing versions of Whisper for transcribing speech on the [Hugging Face Whisper page](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending&search=whisper). Copy the repository ID and paste it into the 'Whisper ASR model' section in 'Advanced Settings'; e.g., `kotoba-tech/kotoba-whisper-v1.1` for Japanese transcription, [available here](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1)
- Support for ASS subtitles and batch processing with subtitles
- Vocal enhancement before transcription
- Added CPU mode with `app_rvc.py --cpu_mode`
- TTS now supports up to 12 speakers
- OpenVoiceV2 integration for voice imitation
- PDF to videobook (displays images from the PDF)
- GUI language translation in Persian and Afrikaans
- **New language support**:
  - **Complete support**: Estonian, Macedonian, Malay, Swahili, Afrikaans, Bosnian, Latin, Myanmar Burmese, Norwegian, Traditional Chinese, Assamese, Basque, Hausa, Haitian Creole, Armenian, Lao, Malagasy, Mongolian, Maltese, Punjabi, Pashto, Slovenian, Shona, Somali, Tajik, Turkmen, Tatar, Uzbek, and Yoruba
  - **Non-transcription**: Aymara, Bambara, Cebuano, Chichewa, Divehi, Dogri, Ewe, Guarani, Iloko, Kinyarwanda, Krio, Kurdish, Kirghiz, Ganda, Maithili, Oriya, Oromo, Quechua, Samoan, Tigrinya, Tsonga, Akan, and Uighur

🔥 2024/03/02: File names are now preserved in the output. Multiple files can be submitted simultaneously by specifying their paths, directories, or URLs separated by commas. Full YouTube playlists can be processed. Regarding [supported site URLs](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md), please be aware that not all sites may work optimally. Added an option for disabling diarization. Implemented soft subtitles. Format output (MP3, MP4, MKV, WAV, and OGG), and resolved issues related to file reading and diarization.

🔥 2024/02/22: Added FreeVC for voice imitation, fixed the voiceless track, and added segment division. New language support (Swedish, Amharic, Welsh, Croatian, Icelandic, Georgian, Khmer, Slovak, Albanian, Serbian, Azerbaijani, Bulgarian, Galician, Gujarati, Kazakh, Kannada, Lithuanian, Latvian, Malayalam, Romanian, Sinhala, and Sundanese). New translations of the GUI (Spanish, French, German, Italian, Japanese, Chinese Simplified, Ukrainian, Arabic, Russian, Turkish, Indonesian, Portuguese, Hindi, Vietnamese, Polish, Swedish, Korean, Marathi, and Azerbaijani). With a subtitle file, alignment is skipped and the media file is not needed to process the SRT file. Burn subtitles into the video. The queue can accept multiple tasks simultaneously. Sound alert notification. Continue a process from the last checkpoint. Acceleration rate regulation.

🔥 2024/01/16: Expanded language support (Thai, Nepali, Catalan, Javanese, Tamil, Marathi, Telugu, Bengali, and Indonesian), the introduction of Whisper large v3, configurable GUI options, and integration of BARK, Facebook-mms, Coqui XTTS, and Piper-TTS. Additional features include audio separation utilities, XTTS WAV creation, using an SRT file as a base for translation, document translation, manual speaker editing, and flexible output options (video, audio, subtitles).

🔥 2023/10/29: Edit the translated subtitle, download it, and adjust volume and speed options.

🔥 2023/08/03: Changed default options and added a directory view of downloads.

🔥 2023/08/02: Added support for Arabic, Czech, Danish, Finnish, Greek, Hebrew, Hungarian, Korean, Persian, Polish, Russian, Turkish, Urdu, Hindi, and Vietnamese. 🌐

🔥 2023/08/01: Added options for using RVC models.

🔥 2023/07/27: Fixed some bugs in video and audio processing.

🔥 2023/07/26: New UI and added mix options.

## Contributing

Contributions from the community are welcome! If you have any ideas, bug reports, or feature requests, please open an issue or submit a pull request. For more information, please refer to the contribution guidelines.

## Credits

This project leverages a number of open-source projects. We would like to acknowledge and thank the contributors of the following repositories:

- [PyTorch](https://github.com/pytorch/pytorch)
- [yt-dlp](https://github.com/yt-dlp/yt-dlp)
- [Gradio](https://github.com/gradio-app/gradio)
- [edge-tts](https://github.com/rany2/edge-tts)
- [deep-translator](https://github.com/nidhaloff/deep-translator)
- [pyannote-audio](https://github.com/pyannote/pyannote-audio)
- [WhisperX](https://github.com/m-bain/whisperX)
- [faster-whisper](https://github.com/SYSTRAN/faster-whisper)
- [CTranslate2](https://github.com/OpenNMT/CTranslate2)
- [Transformers](https://github.com/huggingface/transformers)
- [FFmpeg](https://github.com/FFmpeg/FFmpeg)
- [Piper](https://github.com/rhasspy/piper)
- [Coqui TTS](https://github.com/coqui-ai/TTS)
- [pypdf](https://github.com/py-pdf/pypdf)
- [OpenVoice](https://github.com/myshell-ai/OpenVoice)

## License

Although the code is licensed under Apache 2, the models or weights may have commercial restrictions, as seen with pyannote diarization.
SoniTranslate_Colab.ipynb
CHANGED
@@ -1,27 +1,10 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "gpuType": "T4",
-      "include_colab_link": true
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU"
-  },
   "cells": [
     {
       "cell_type": "markdown",
       "metadata": {
-        "…
-        "…
+        "colab_type": "text",
+        "id": "view-in-github"
       },
       "source": [
         "<a href=\"https://colab.research.google.com/github/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@@ -29,6 +12,9 @@
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "8lw0EgLex-YZ"
+      },
       "source": [
         "# SoniTranslate\n",
         "\n",
@@ -38,17 +24,14 @@
         "| 🚀 Online Demo in HF | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content) |\n",
         "\n",
         "\n"
-      ]
-      "metadata": {
-        "id": "8lw0EgLex-YZ"
-      }
+      ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "…
-        "…
+        "cellView": "form",
+        "id": "LUgwm0rfx0_J"
       },
       "outputs": [],
       "source": [
@@ -56,6 +39,8 @@
         "!git clone https://github.com/r3gm/SoniTranslate.git\n",
         "%cd SoniTranslate\n",
         "\n",
+        "!pip uninstall chex pandas-stubs ibis-framework albumentations albucore -y -q\n",
+        "!python -m pip install -q pip==23.1.2\n",
         "!apt install git-lfs\n",
         "!git lfs install\n",
         "\n",
@@ -78,20 +63,25 @@
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "LTaTstXPXNg2"
+      },
       "source": [
         "One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation\n",
         "\n",
+        "Get your KEY TOKEN here: https://hf.co/settings/tokens\n",
         "\n",
-        "\…
-        "Get your KEY TOKEN here: https://hf.co/settings/tokens"
-      ],
-      "metadata": {
-        "id": "LTaTstXPXNg2"
-      }
+        "When you are creating the new Access Token in Hugging Face, make sure to tick \"Read access to contents of all public gated repos you can access\"."
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "XkhXfaFw4R4J"
+      },
+      "outputs": [],
       "source": [
         "#@markdown # `RUN THE WEB APP`\n",
         "YOUR_HF_TOKEN = \"\" #@param {type:'string'}\n",
@@ -103,22 +93,33 @@
         "\n",
         "%cd /content/SoniTranslate\n",
         "!python app_rvc.py --theme {theme} --verbosity_level {verbosity_level} --language {interface_language} --public_url"
-      ]
-      "metadata": {
-        "id": "XkhXfaFw4R4J",
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "Open the `public URL` when it appears"
-      ],
       "metadata": {
         "id": "KJW3KrhZJh0u"
-      }
+      },
+      "source": [
+        "Open the `public URL` when it appears"
+      ]
     }
-
-
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "include_colab_link": true,
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
SoniTranslate_Colab_embedded.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
assets/Video.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e03c405628162038a10679372d43ca0e38e40507e1e5fce8b6d252ec3ca4dbf8
+size 1931745
assets/video_dub.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d501c9a646b576d7bc3af0c51900dd6b66992fe82ac90aded1f28091af17ca65
+size 1813313
docs/windows_install.md
CHANGED
@@ -28,7 +28,7 @@ Before you start installing and using SoniTranslate, there are a few things you
 3. If you see that your CUDA version is less than 11.8, you should update your NVIDIA driver. Visit the NVIDIA website's driver download page (https://www.nvidia.com/Download/index.aspx) and enter your graphics card information.
 
 4. Accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation
-5. Create a [huggingface token](https://huggingface.co/settings/tokens). Hugging Face is a natural language processing platform that provides access to state-of-the-art models and tools. You will need to create a token in order to use some of the automatic model download features in SoniTranslate. Follow the instructions on the Hugging Face website to create a token.
+5. Create a [huggingface token](https://huggingface.co/settings/tokens). Hugging Face is a natural language processing platform that provides access to state-of-the-art models and tools. You will need to create a token in order to use some of the automatic model download features in SoniTranslate. Follow the instructions on the Hugging Face website to create a token. When you are creating the new Access Token in Hugging Face, make sure to tick "Read access to contents of all public gated repos you can access".
 6. Install [Anaconda](https://www.anaconda.com/) or [Miniconda](https://docs.anaconda.com/free/miniconda/miniconda-install/). Anaconda is a free and open-source distribution of Python and R. It includes a package manager called conda that makes it easy to install and manage Python environments and packages. Follow the instructions on the Anaconda website to download and install Anaconda on your system.
 7. Install Git for your system. Git is a version control system that helps you track changes to your code and collaborate with other developers. You can install Git with Anaconda by running `conda install -c anaconda git -y` in your terminal (Do this after step 1 in the following section.). If you have trouble installing Git via Anaconda, you can use the following link instead:
 - [Git for Windows](https://git-scm.com/download/win)
@@ -44,6 +44,7 @@ To install SoniTranslate, follow these steps:
 ```
 conda create -n sonitr python=3.10 -y
 conda activate sonitr
+python -m pip install pip==23.1.2
 ```
 
 2. Clone this github repository and navigate to it:
@@ -100,7 +101,6 @@ conda env config vars set YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN_HERE"
 conda deactivate
 ```
 
-
 ### Running SoniTranslate
 
 To run SoniTranslate locally, make sure the `sonitr` conda environment is active:
requirements.txt
CHANGED
@@ -1,19 +1,37 @@
+# Temporal requirements
+nest_asyncio
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch>=2.1.0+cu118
+torchvision>=0.16.0+cu118
+torchaudio>=2.1.0+cu118
+yt-dlp
+gradio==4.19.2
+pydub==0.25.1
+edge_tts==6.1.7
+deep_translator==1.11.4
+git+https://github.com/m-bain/whisperX.git@a5dca2c
+gTTS
+gradio_client==0.10.1
 praat-parselmouth>=0.4.3
 pyworld==0.3.2
 faiss-cpu==1.7.3
 torchcrepe==0.0.20
 ffmpeg-python>=0.2.0
-fairseq
+git+https://github.com/facebookresearch/fairseq.git@refs/pull/5359/merge
 gdown
 rarfile
+IPython
 transformers
 accelerate
 optimum
 sentencepiece
 srt
+onnxruntime-gpu
 git+https://github.com/R3gm/openvoice_package.git@lite
-openai==1.14.3
-tiktoken==0.6.0
 # Documents
-
-python-docx
+PyPDF2
+python-docx
+
+# after this
+# pip install git+https://github.com/omry/omegaconf.git@refs/pull/1137/merge
requirements_base.txt
ADDED
@@ -0,0 +1,15 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch>=2.1.0+cu118
+torchvision>=0.16.0+cu118
+torchaudio>=2.1.0+cu118
+yt-dlp
+gradio==4.19.2
+pydub==0.25.1
+edge_tts==6.1.7
+deep_translator==1.11.4
+git+https://github.com/R3gm/pyannote-audio.git@3.1.1
+git+https://github.com/R3gm/whisperX.git@cuda_11_8
+nest_asyncio
+gTTS
+gradio_client==0.10.1
+IPython
requirements_extra.txt
ADDED
@@ -0,0 +1,19 @@
+praat-parselmouth>=0.4.3
+pyworld==0.3.2
+faiss-cpu==1.7.3
+torchcrepe==0.0.20
+ffmpeg-python>=0.2.0
+fairseq==0.12.2
+gdown
+rarfile
+transformers
+accelerate
+optimum
+sentencepiece
+srt
+git+https://github.com/R3gm/openvoice_package.git@lite
+openai==1.14.3
+tiktoken==0.6.0
+# Documents
+pypdf==4.2.0
+python-docx
soni_translate/languages_gui.py
CHANGED
The diff for this file is too large to render.
See raw diff
soni_translate/mdx_net.py
CHANGED
@@ -367,18 +367,6 @@ def run_mdx(
     processor_num = -1
     m_threads = 1
 
-    if os.environ.get("ZERO_GPU") == "TRUE":
-        duration = librosa.get_duration(filename=filename)
-
-        if duration < 60:
-            pass
-        elif duration >= 60 and duration <= 900:
-            m_threads = 4
-        elif duration > 900:
-            m_threads = 16
-
-        logger.info(f"MDX-NET Threads: {m_threads}, duration {duration}")
-
     model_hash = MDX.get_hash(model_path)
     mp = model_params.get(model_hash)
     model = MDXModel(
soni_translate/preprocessor.py
CHANGED
@@ -14,7 +14,7 @@ ERROR_INCORRECT_CODEC_PARAMETERS = [
 TESTED_CODECS = [
     "h264", # mp4
     "h265", # mp4
-    "hevc",
+    "hevc",
     "vp9", # webm
     "mpeg4", # mp4
     "mpeg2video", # mpg
soni_translate/speech_segmentation.py
CHANGED
@@ -14,87 +14,6 @@ from .logging_setup import logger
 from .postprocessor import sanitize_file_name
 from .utils import remove_directory_contents, run_command
 
-# ZERO GPU CONFIG
-import spaces
-import copy
-import random
-import time
-
-def random_sleep():
-    if os.environ.get("ZERO_GPU") == "TRUE":
-        print("Random sleep")
-        sleep_time = round(random.uniform(7.2, 9.9), 1)
-        time.sleep(sleep_time)
-
-
-@spaces.GPU
-def load_and_transcribe_audio(asr_model, audio, compute_type, language, asr_options, batch_size, segment_duration_limit):
-    # Load model
-    model = whisperx.load_model(
-        asr_model,
-        os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda",
-        compute_type=compute_type,
-        language=language,
-        asr_options=asr_options,
-    )
-
-    # Transcribe audio
-    result = model.transcribe(
-        audio,
-        batch_size=batch_size,
-        chunk_size=segment_duration_limit,
-        print_progress=True,
-    )
-
-    del model
-    gc.collect()
-    torch.cuda.empty_cache()  # noqa
-
-    return result
-
-def load_align_and_align_segments(result, audio, DAMHF):
-
-    # Load alignment model
-    model_a, metadata = whisperx.load_align_model(
-        language_code=result["language"],
-        device=os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda",
-        model_name=None
-        if result["language"] in DAMHF.keys()
-        else EXTRA_ALIGN[result["language"]],
-    )
-
-    # Align segments
-    alignment_result = whisperx.align(
-        result["segments"],
-        model_a,
-        metadata,
-        audio,
-        os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda",
-        return_char_alignments=True,
-        print_progress=False,
-    )
-
-    # Clean up
-    del model_a
-    gc.collect()
-    torch.cuda.empty_cache()  # noqa
-
-    return alignment_result
-
-@spaces.GPU
-def diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers):
-
-    if os.environ.get("ZERO_GPU") == "TRUE":
-        diarize_model.model.to(torch.device("cuda"))
-    diarize_segments = diarize_model(
-        audio_wav,
-        min_speakers=min_speakers,
-        max_speakers=max_speakers
-    )
-    return diarize_segments
-
-# ZERO GPU CONFIG
-
 ASR_MODEL_OPTIONS = [
     "tiny",
     "base",
@@ -224,6 +143,7 @@ def find_whisper_models():
         folders.append(folder)
     return folders
 
+
 def transcribe_speech(
     audio_wav,
     asr_model,
@@ -308,17 +228,29 @@ def transcribe_speech(
     asr_model = model_dir
     logger.info(f"ASR Model: {str(model_dir)}")
 
+    model = whisperx.load_model(
+        asr_model,
+        os.environ.get("SONITR_DEVICE"),
+        compute_type=compute_type,
+        language=SOURCE_LANGUAGE,
+        asr_options=asr_options,
+    )
+
     audio = whisperx.load_audio(audio_wav)
-    …
-    …
-    …
+    result = model.transcribe(
+        audio,
+        batch_size=batch_size,
+        chunk_size=segment_duration_limit,
+        print_progress=True,
     )
 
     if result["language"] == "zh" and not prompt:
         result["language"] = "zh-TW"
         logger.info("Chinese - Traditional (zh-TW)")
 
-    …
+    del model
+    gc.collect()
+    torch.cuda.empty_cache()  # noqa
     return audio, result
 
 
@@ -369,9 +301,25 @@ def align_speech(audio, result):
     )
     return result
 
-    …
-    …
-    …
+    model_a, metadata = whisperx.load_align_model(
+        language_code=result["language"],
+        device=os.environ.get("SONITR_DEVICE"),
+        model_name=None
+        if result["language"] in DAMHF.keys()
+        else EXTRA_ALIGN[result["language"]],
+    )
+    result = whisperx.align(
+        result["segments"],
+        model_a,
+        metadata,
+        audio,
+        os.environ.get("SONITR_DEVICE"),
+        return_char_alignments=True,
+        print_progress=False,
+    )
+    del model_a
+    gc.collect()
+    torch.cuda.empty_cache()  # noqa
     return result
 
 
@@ -471,9 +419,9 @@ def diarize_speech(
     )
     else:
         raise error
-    …
-    …
-    …
+    diarize_segments = diarize_model(
+        audio_wav, min_speakers=min_speakers, max_speakers=max_speakers
+    )
 
     result_diarize = whisperx.assign_word_speakers(
         diarize_segments, result
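Taken together, these hunks delete the ZeroGPU-specific wrappers (`random_sleep`, `load_and_transcribe_audio`, `load_align_and_align_segments`, `diarize_audio`) and fold their bodies back into `transcribe_speech`, `align_speech`, and `diarize_speech`, always reading the device from `SONITR_DEVICE`. A minimal standalone sketch of the resulting transcription flow, assuming the WhisperX fork pinned in `requirements_base.txt` and a CUDA device (the model name and audio path here are illustrative):

```python
import gc
import os

import torch
import whisperx

os.environ.setdefault("SONITR_DEVICE", "cuda")  # device env var used by the repo

# Load the ASR model on the configured device, as the inlined code now does.
model = whisperx.load_model(
    "large-v3",                        # illustrative model name
    os.environ.get("SONITR_DEVICE"),
    compute_type="float16",
)

audio = whisperx.load_audio("sample.wav")  # illustrative input file
result = model.transcribe(audio, batch_size=8, print_progress=True)
print(result["language"], "segments:", len(result["segments"]))

# Release GPU memory once transcription is done, mirroring the patch.
del model
gc.collect()
torch.cuda.empty_cache()
```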
soni_translate/text_multiformat_processor.py
CHANGED
@@ -357,7 +357,7 @@ def add_border_to_image(
 
     bordered_img = ImageOps.expand(resized_img, padding, fill=border_color)
 
-    bordered_img.save(image_path)
+    bordered_img.save(image_path, format='PNG')
 
     return image_path
 
@@ -506,6 +506,8 @@ def doc_to_txtximg_pages(
     images = []
     for image_file_object in page.images:
         img_name = f"{images_folder}{i:04d}_{count:02d}_{image_file_object.name}"
+        if not img_name.lower().endswith('.png'):
+            img_name = os.path.splitext(img_name)[0] + '.png'
         images.append(img_name)
         with open(img_name, "wb") as fp:
             fp.write(image_file_object.data)
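Both hunks push PDF-derived imagery toward PNG: the first makes PIL encode the bordered image explicitly as PNG, and the second normalizes the names of images extracted from PDF pages to a `.png` extension before the bytes are written. A small illustration of the renaming rule (the function wrapper and file names are hypothetical; only the two-line rule is from the patch):

```python
import os

def force_png_extension(img_name: str) -> str:
    # Same rule as the patch: swap any non-.png extension for .png.
    if not img_name.lower().endswith('.png'):
        img_name = os.path.splitext(img_name)[0] + '.png'
    return img_name

print(force_png_extension("0001_00_figure.jp2"))  # -> 0001_00_figure.png
print(force_png_extension("0002_01_photo.png"))   # -> unchanged
```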
soni_translate/text_to_speech.py
CHANGED
@@ -15,6 +15,7 @@ from .utils import (
     remove_directory_contents,
     remove_files,
     run_command,
+    write_chunked,
 )
 import numpy as np
 from typing import Any, Dict
@@ -59,7 +60,7 @@ def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename):
     # Read audio data from the TemporaryFile using soundfile
     audio_data, samplerate = sf.read(f)
     f.close()  # Close the TemporaryFile
-    …
+    write_chunked(
         filename, audio_data, samplerate, format="ogg", subtype="vorbis"
     )
 
@@ -73,7 +74,7 @@
     sample_rate_aux = 22050
     duration = float(segment["end"]) - float(segment["start"])
     data = np.zeros(int(sample_rate_aux * duration)).astype(np.float32)
-    …
+    write_chunked(
         filename, data, sample_rate_aux, format="ogg", subtype="vorbis"
     )
     logger.error("Audio will be replaced -> [silent audio].")
@@ -181,7 +182,7 @@ def segments_egde_tts(filtered_edge_segments, TRANSLATE_AUDIO_TO, is_gui):
     # os.remove(temp_file)
 
     # Save file
-    …
+    write_chunked(
         file=filename,
         samplerate=sample_rate,
         data=data,
@@ -256,7 +257,7 @@ def segments_bark_tts(
         speech_output.cpu().numpy().squeeze().astype(np.float32),
         sampling_rate,
     )
-    …
+    write_chunked(
         file=filename,
         samplerate=sampling_rate,
         data=data_tts,
@@ -299,7 +300,7 @@ def uromanize(input_string):
         stderr=subprocess.PIPE,
     )
     stdout, stderr = process.communicate()
-    script_path = os.path.join("./uroman", "…
+    script_path = os.path.join("./uroman", "uroman", "uroman.pl")
 
     command = ["perl", script_path]
 
@@ -362,7 +363,7 @@ def segments_vits_tts(filtered_vits_segments, TRANSLATE_AUDIO_TO):
         sampling_rate,
     )
     # Save file
-    …
+    write_chunked(
         file=filename,
         samplerate=sampling_rate,
         data=data_tts,
@@ -667,7 +668,7 @@ def segments_coqui_tts(
         sampling_rate,
     )
     # Save file
-    …
+    write_chunked(
         file=filename,
         samplerate=sampling_rate,
         data=data_tts,
@@ -855,7 +856,7 @@ def segments_vits_onnx_tts(filtered_onnx_vits_segments, TRANSLATE_AUDIO_TO):
         sampling_rate,
     )
     # Save file
-    …
+    write_chunked(
         file=filename,
         samplerate=sampling_rate,
         data=data_tts,
@@ -925,7 +926,7 @@ def segments_openai_tts(
         sampling_rate,
     )
 
-    …
+    write_chunked(
         file=filename,
         samplerate=sampling_rate,
         data=data_tts,
@@ -1509,7 +1510,7 @@ def toneconverter_freevc(
         target_wav=original_wav_audio_segment,
     )
 
-    …
+    write_chunked(
         file=save_path,
         samplerate=tts.voice_converter.vc_config.audio.output_sample_rate,
         data=wav,
soni_translate/utils.py
CHANGED
@@ -3,6 +3,8 @@ from .logging_setup import logger
 from urllib.parse import urlparse
 from IPython.utils import capture
 import re
+import soundfile as sf
+import numpy as np
 
 VIDEO_EXTENSIONS = [
     ".mp4",
@@ -66,6 +68,31 @@ def run_command(command):
     raise Exception(errors.decode())
 
 
+def write_chunked(
+    file,
+    data,
+    samplerate,
+    subtype=None,
+    endian=None,
+    format=None,
+    closefd=True,
+    chunk_size=0x1000
+):
+
+    data = np.asarray(data)
+    if data.ndim == 1:
+        channels = 1
+    else:
+        channels = data.shape[1]
+    with sf.SoundFile(
+        file, 'w', samplerate, channels,
+        subtype, endian, format, closefd
+    ) as f:
+        num_chunks = (len(data) + chunk_size - 1) // chunk_size
+        for chunk in np.array_split(data, num_chunks, axis=0):
+            f.write(chunk)
+
+
 def print_tree_directory(root_dir, indent=""):
     if not os.path.exists(root_dir):
         logger.error(f"{indent} Invalid directory or file: {root_dir}")
@@ -143,10 +170,6 @@ def manual_download(url, dst):
 
 
 def download_list(text_downloads):
-
-    if os.environ.get("ZERO_GPU") == "TRUE":
-        raise RuntimeError("This option is disabled in this demo.")
-
     try:
         urls = [elem.strip() for elem in text_downloads.split(",")]
     except Exception as error:
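`write_chunked` is the helper that the TTS modules and `voice_main.py` now call in place of their former direct soundfile writes: it opens the target with `sf.SoundFile` and streams the array in 0x1000-frame chunks rather than handing soundfile one large buffer. A quick round-trip sketch (the generated sine tone is illustrative, not from the repository):

```python
import numpy as np
import soundfile as sf
from soni_translate.utils import write_chunked

sr = 22050
t = np.arange(sr) / sr
tone = (0.2 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)  # 1 s at 440 Hz

# Stream the samples to disk in chunks, as the TTS segment savers now do.
write_chunked("tone.ogg", tone, sr, format="ogg", subtype="vorbis")

data, rate = sf.read("tone.ogg")
print(rate, data.shape)  # 22050 and roughly one second of samples
```

Writing in chunks keeps peak memory flat for long dubbed tracks, which is presumably why the one-shot writes were replaced.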
voice_main.py
CHANGED
@@ -17,7 +17,11 @@ from lib.audio import load_audio
 import soundfile as sf
 import edge_tts
 import asyncio
-from soni_translate.utils import …
+from soni_translate.utils import (
+    remove_directory_contents,
+    create_directories,
+    write_chunked,
+)
 from scipy import signal
 from time import time as ttime
 import faiss
@@ -437,10 +441,12 @@ class ClassVoices:
             output_audio_path = new_path
 
         # Save file
-        …
+        write_chunked(
             file=output_audio_path,
             samplerate=final_sr,
-            data=audio_opt
+            data=audio_opt,
+            format="ogg",
+            subtype="vorbis",
         )
 
         self.model_config[task_id]["result"].append(output_audio_path)