test-rtechs committed on
Commit 9670139
1 Parent(s): 42f33fc

Upload 38 files

.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "image": "mcr.microsoft.com/devcontainers/universal:2",
+ "features": {"ghcr.io/devcontainers-contrib/features/ffmpeg-apt-get:1": {},
+ "ghcr.io/devcontainers-contrib/features/yt-dlp:2": {}
+ }
+ }
.gitattributes CHANGED
@@ -1,2 +1,4 @@
 assets/Video_subtitled.mp4 filter=lfs diff=lfs merge=lfs -text
 assets/Video_main.mp4 filter=lfs diff=lfs merge=lfs -text
+ assets/video_dub.mp4 filter=lfs diff=lfs merge=lfs -text
+ assets/Video.mp4 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,16 +1,364 @@
- ---
- title: Video Dubbing (SoniTranslate)
- emoji: 🌍
- colorFrom: blue
- colorTo: green
- sdk: gradio
- sdk_version: 4.31.3
- app_file: app_rvc.py
- pinned: true
- license: mit
- short_description: Video Dubbing with Open Source Projects
- preload_from_hub:
- - Systran/faster-whisper-large-v3
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 🎥 SoniTranslate 🈷️
+
+ 🎬 Video Translation with Synchronized Audio 🌐
+
+ SoniTranslate is a powerful and user-friendly web application that lets you easily translate videos into different languages. This repository hosts the code for the SoniTranslate web UI, which is built with the Gradio library to provide a seamless and interactive user experience.
+
+ | Description | Link |
+ | ----------- | ---- |
+ | 📙 Colab Notebook | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb) |
+ | 🎉 Repository | [![GitHub Repository](https://img.shields.io/badge/GitHub-Repository-black?style=flat-square&logo=github)](https://github.com/R3gm/SoniTranslate/) |
+ | 🚀 Online DEMO | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content) |
+
+ ## SoniTranslate's web UI: a browser interface built on the Gradio library
+ ![image](https://github.com/R3gm/SoniTranslate/assets/114810545/0d71fbf4-e9f0-4f8f-944e-8f3f1ea6a019)
+
+ ## Using the project: A video guide
+
+ For a comprehensive understanding of the project, we highly recommend watching this video tutorial by [DEV-MalletteS](https://github.com/DEV-MalletteS). You can watch it on YouTube by clicking the thumbnail below:
+
+ [![Watch the video](https://img.youtube.com/vi/SmGkFaSzq_Q/0.jpg)](https://www.youtube.com/watch?v=SmGkFaSzq_Q)
+
+ ## Supported languages for translation
+
+ | Language Code | Language |
+ |---------------|------------|
+ | en | English |
+ | fr | French |
+ | de | German |
+ | es | Spanish |
+ | it | Italian |
+ | ja | Japanese |
+ | nl | Dutch |
+ | uk | Ukrainian |
+ | pt | Portuguese |
+ | ar | Arabic |
+ | zh | Chinese - Simplified |
+ | zh-TW | Chinese - Traditional |
+ | cs | Czech |
+ | da | Danish |
+ | fi | Finnish |
+ | el | Greek |
+ | he | Hebrew |
+ | hu | Hungarian |
+ | ko | Korean |
+ | fa | Persian |
+ | pl | Polish |
+ | ru | Russian |
+ | tr | Turkish |
+ | ur | Urdu |
+ | hi | Hindi |
+ | vi | Vietnamese |
+ | id | Indonesian |
+ | bn | Bengali |
+ | te | Telugu |
+ | mr | Marathi |
+ | ta | Tamil |
+ | jw (or jv) | Javanese |
+ | ca | Catalan |
+ | ne | Nepali |
+ | th | Thai |
+ | sv | Swedish |
+ | am | Amharic |
+ | cy | Welsh |
+ | hr | Croatian |
+ | is | Icelandic |
+ | ka | Georgian |
+ | km | Khmer |
+ | sk | Slovak |
+ | sq | Albanian |
+ | sr | Serbian |
+ | az | Azerbaijani |
+ | bg | Bulgarian |
+ | gl | Galician |
+ | gu | Gujarati |
+ | kk | Kazakh |
+ | kn | Kannada |
+ | lt | Lithuanian |
+ | lv | Latvian |
+ | ml | Malayalam |
+ | ro | Romanian |
+ | si | Sinhala |
+ | su | Sundanese |
+ | et | Estonian |
+ | mk | Macedonian |
+ | sw | Swahili |
+ | af | Afrikaans |
+ | bs | Bosnian |
+ | la | Latin |
+ | my | Myanmar Burmese |
+ | no | Norwegian |
+ | as | Assamese |
+ | eu | Basque |
+ | ha | Hausa |
+ | ht | Haitian Creole |
+ | hy | Armenian |
+ | lo | Lao |
+ | mg | Malagasy |
+ | mn | Mongolian |
+ | mt | Maltese |
+ | pa | Punjabi |
+ | ps | Pashto |
+ | sl | Slovenian |
+ | sn | Shona |
+ | so | Somali |
+ | tg | Tajik |
+ | tk | Turkmen |
+ | tt | Tatar |
+ | uz | Uzbek |
+ | yo | Yoruba |
+
+ ### Non-transcription
+
+ | Language Code | Language |
+ |---------------|------------|
+ | ay | Aymara |
+ | bm | Bambara |
+ | ceb | Cebuano |
+ | ny | Chichewa |
+ | dv | Divehi |
+ | doi | Dogri |
+ | ee | Ewe |
+ | gn | Guarani |
+ | ilo | Iloko |
+ | rw | Kinyarwanda |
+ | kri | Krio |
+ | ku | Kurdish |
+ | ky | Kirghiz |
+ | lg | Ganda |
+ | mai | Maithili |
+ | or | Oriya |
+ | om | Oromo |
+ | qu | Quechua |
+ | sm | Samoan |
+ | ti | Tigrinya |
+ | ts | Tsonga |
+ | ak | Akan |
+ | ug | Uighur |
+
+ ## Example:
+
+ ### Original audio
+
+ https://github.com/R3gm/SoniTranslate/assets/114810545/db9e78c0-b228-4e81-9704-e62d5cc407a3
+
+ ### Translated audio
+
+ https://github.com/R3gm/SoniTranslate/assets/114810545/6a8ddc65-a46f-4653-9726-6df2615f0ef9
+
+ ## Colab Runtime
+
+ To run SoniTranslate using Colab Runtime: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)
+
+ ## Install Locally (Installation tested in Linux)
+
+ ### Before You Start
+
+ Before you start installing and using SoniTranslate, there are a few things you need to do:
+
+ 1. Install the NVIDIA drivers for CUDA 11.8.0. NVIDIA CUDA is a parallel computing platform and programming model that enables developers to use the power of NVIDIA graphics processing units (GPUs) to speed up compute-intensive tasks. You can find the drivers [here](https://developer.nvidia.com/cuda-toolkit-archive). Follow the instructions on the website to download and install the drivers.
+ 2. Accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation
+ 3. Create a [huggingface token](https://huggingface.co/settings/tokens). Hugging Face is a natural language processing platform that provides access to state-of-the-art models and tools. You will need to create a token in order to use some of the automatic model download features in SoniTranslate. Follow the instructions on the Hugging Face website to create a token. When you create the new Access Token in Hugging Face, make sure to tick "Read access to contents of all public gated repos you can access".
+ 4. Install [Anaconda](https://www.anaconda.com/) or [Miniconda](https://docs.anaconda.com/free/miniconda/miniconda-install/). Anaconda is a free and open-source distribution of Python and R. It includes a package manager called conda that makes it easy to install and manage Python environments and packages. Follow the instructions on the Anaconda website to download and install Anaconda on your system.
+ 5. Install Git for your system. Git is a version control system that helps you track changes to your code and collaborate with other developers. You can install Git with Anaconda by running `conda install -c anaconda git -y` in your terminal (do this after step 1 in the following section). If you have trouble installing Git via Anaconda, you can use the following link instead:
+ - [Git for Linux](https://git-scm.com/download/linux)
+
+ Once you have completed these steps, you will be ready to install SoniTranslate.
+
+ ### Getting Started
+
+ To install SoniTranslate, follow these steps:
+
+ 1. Create a suitable anaconda environment for SoniTranslate and activate it:
+
+ ```
+ conda create -n sonitr python=3.10 -y
+ conda activate sonitr
+ python -m pip install pip==23.1.2
+ conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
+ ```
+
+ 2. Clone this GitHub repository and navigate to it:
+ ```
+ git clone https://github.com/r3gm/SoniTranslate.git
+ cd SoniTranslate
+ ```
+
+ 3. Install the required packages:
+
+ ```
+ pip install -r requirements_base.txt -v
+ pip install -r requirements_extra.txt -v
+ pip install onnxruntime-gpu
+ ```
+
+ 4. Install [ffmpeg](https://ffmpeg.org/download.html). FFmpeg is a free software project that produces libraries and programs for handling multimedia data. You will need it to process audio and video files. You can install ffmpeg with Anaconda by running `conda install -y ffmpeg` in your terminal (recommended). If you have trouble installing ffmpeg via Anaconda, you can use the following link instead: https://ffmpeg.org/ffmpeg.html. Once it is installed, make sure it is in your PATH by running `ffmpeg -h` in your terminal. If you don't get an error message, you're good to go.
+
+ 5. Optional install:
+
+ After installing FFmpeg, you can install these optional packages.
+
+ [Piper TTS](https://github.com/rhasspy/piper) is a fast, local neural text-to-speech system that sounds great and is optimized for the Raspberry Pi 4. Piper is used in a variety of projects. Voices are trained with VITS and exported for use with onnxruntime.
+
+ ```
+ pip install -q piper-tts==1.2.0
+ ```
+
+ [Coqui XTTS](https://github.com/coqui-ai/TTS) is a text-to-speech (TTS) model that lets you generate realistic voices in different languages. It can clone voices with just a short audio clip and can even speak in a different language. It's like having a personal voice mimic for any text you need spoken.
+
+ ```
+ pip install -q -r requirements_xtts.txt
+ pip install -q TTS==0.21.1 --no-deps
+ ```
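+
+ If you want to sanity-check the optional XTTS install outside of SoniTranslate, a minimal sketch like the following can be used. It is not part of this repository; the model name, file paths, and example text are only illustrative assumptions.
+
+ ```
+ # Hypothetical standalone check of the optional Coqui XTTS install (not SoniTranslate code).
+ import torch
+ from TTS.api import TTS
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+
+ # Clone the voice from a short reference clip and speak the given text with it.
+ tts.tts_to_file(
+     text="Hola, esta es una voz clonada de ejemplo.",
+     speaker_wav="reference_voice.wav",  # assumed path to a short clip of the voice to imitate
+     language="es",                      # target language code
+     file_path="xtts_sample.wav",
+ )
+ ```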
+
+ ### Running SoniTranslate
+
+ To run SoniTranslate locally, make sure the `sonitr` conda environment is active:
+
+ ```
+ conda activate sonitr
+ ```
+
+ Set your Hugging Face token as an environment variable (Linux):
+
+ ```
+ export YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN"
+ ```
+
+ Then navigate to the `SoniTranslate` folder and run the `app_rvc.py` script:
+
+ ```
+ python app_rvc.py
+ ```
+ When the `local URL` `http://127.0.0.1:7860` is displayed in the terminal, simply open this URL in your web browser to access the SoniTranslate interface.
+
+ ### Stop and close SoniTranslate
+
+ In most environments, you can stop the execution by pressing Ctrl+C in the terminal where you launched the script `app_rvc.py`. This will interrupt the program and stop the Gradio app.
+ To deactivate the Conda environment, you can use the following command:
+
+ ```
+ conda deactivate
+ ```
+
+ This will deactivate the currently active Conda environment `sonitr`, and you'll return to the base environment or the global Python environment.
+
+ ### Starting Over
+
+ If you need to start over from scratch, you can delete the `SoniTranslate` folder and remove the `sonitr` conda environment with the following set of commands:
+
+ ```
+ conda deactivate
+ conda env remove -n sonitr
+ ```
+
+ With the `sonitr` environment removed, you can start over with a fresh installation.
+
+ ### Notes
+
+ - Alternatively, you can set your Hugging Face token as a permanent environment variable with:
+
+ ```
+ conda activate sonitr
+ conda env config vars set YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN_HERE"
+ conda deactivate
+ ```
+
+ - To use OpenAI's GPT API for translation, TTS, or transcription, set up your OpenAI API key as an environment variable in quotes:
+
+ ```
+ conda activate sonitr
+ conda env config vars set OPENAI_API_KEY="your-api-key-here"
+ conda deactivate
+ ```
+
+ ## Command line arguments
+
+ The `app_rvc.py` script supports command-line arguments to customize its behavior. Here's a brief guide on how to use them:
+
+ | Argument command | Default | Value | Description |
+ |------------------|---------|-------|-------------|
+ | --theme | Taithrah/Minimal | String | Sets the theme for the interface. Themes can be found in the [Theme Gallery](https://huggingface.co/spaces/gradio/theme-gallery). |
+ | --language | english | String | Selects the interface language. Available options: afrikaans, arabic, azerbaijani, chinese_zh_cn, english, french, german, hindi, indonesian, italian, japanese, korean, marathi, persian, polish, portuguese, russian, spanish, swedish, turkish, ukrainian, vietnamese. |
+ | --verbosity_level | info | String | Sets the verbosity level of the logger: debug, info, warning, error, or critical. |
+ | --public_url | | Boolean | Enables a public link. |
+ | --cpu_mode | | Boolean | Enables CPU mode, running the program without GPU acceleration. |
+ | --logs_in_gui | | Boolean | Shows the operations performed in Logs (obsolete). |
+
+ Example usage:
+ ```
+ python app_rvc.py --theme aliabid94/new-theme --language french
+ ```
+ This command sets the theme to a custom theme and selects French as the interface language.
+ Feel free to customize these arguments according to your preferences and requirements.
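+
+ As another example, both flags taken from the table above, you could run the app without a GPU and expose a shareable public link:
+ ```
+ python app_rvc.py --cpu_mode --public_url
+ ```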
+
+ ## 📖 News
+
+ 🔥 2024/05/18: New Update Details
+ - Added Overlap Reduction option
+ - OpenAI API key integration for transcription, translation, and TTS
+ - More output types: subtitles by speaker, separate audio sound, and video only with subtitles
+ - Access to a better-performing version of Whisper for transcribing speech on the [Hugging Face Whisper page](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending&search=whisper). Copy the repository ID and paste it into the 'Whisper ASR model' section in 'Advanced Settings'; e.g., `kotoba-tech/kotoba-whisper-v1.1` for Japanese transcription [available here](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1)
+ - Support for ASS subtitles and batch processing with subtitles
+ - Vocal enhancement before transcription
+ - Added CPU mode with `app_rvc.py --cpu_mode`
+ - TTS now supports up to 12 speakers
+ - OpenVoiceV2 integration for voice imitation
+ - PDF to videobook (displays images from the PDF)
+ - GUI language translation in Persian and Afrikaans
+ - **New Language Support**:
+   - **Complete support**: Estonian, Macedonian, Malay, Swahili, Afrikaans, Bosnian, Latin, Myanmar Burmese, Norwegian, Traditional Chinese, Assamese, Basque, Hausa, Haitian Creole, Armenian, Lao, Malagasy, Mongolian, Maltese, Punjabi, Pashto, Slovenian, Shona, Somali, Tajik, Turkmen, Tatar, Uzbek, and Yoruba
+   - **Non-transcription**: Aymara, Bambara, Cebuano, Chichewa, Divehi, Dogri, Ewe, Guarani, Iloko, Kinyarwanda, Krio, Kurdish, Kirghiz, Ganda, Maithili, Oriya, Oromo, Quechua, Samoan, Tigrinya, Tsonga, Akan, and Uighur
+
+ 🔥 2024/03/02: Preserve file names in output. Multiple files can now be submitted simultaneously by specifying their paths, directories, or URLs separated by commas. Processing of a full YouTube playlist. Regarding [supported site URLs](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md), please be aware that not all sites may work optimally. Added an option for disabling diarization. Implemented soft subtitles. Format output (MP3, MP4, MKV, WAV, and OGG), and resolved issues related to file reading and diarization.
+
+ 🔥 2024/02/22: Added FreeVC for voice imitation, fixed the voiceless track, and added segment splitting. New language support (Swedish, Amharic, Welsh, Croatian, Icelandic, Georgian, Khmer, Slovak, Albanian, Serbian, Azerbaijani, Bulgarian, Galician, Gujarati, Kazakh, Kannada, Lithuanian, Latvian, Malayalam, Romanian, Sinhala, and Sundanese). New translations of the GUI (Spanish, French, German, Italian, Japanese, Chinese Simplified, Ukrainian, Arabic, Russian, Turkish, Indonesian, Portuguese, Hindi, Vietnamese, Polish, Swedish, Korean, Marathi, and Azerbaijani). With a subtitle file, no alignment is performed and the media file is not needed to process the SRT file. Burn subtitles to video. The queue can accept multiple tasks simultaneously. Sound alert notification. Continue the process from the last checkpoint. Acceleration rate regulation.
+
+ 🔥 2024/01/16: Expanded language support (Thai, Nepali, Catalan, Javanese, Tamil, Marathi, Telugu, Bengali, and Indonesian), the introduction of whisper large v3, configurable GUI options, and integration of BARK, Facebook-mms, Coqui XTTS, and Piper-TTS. Additional features include audio separation utilities, XTTS WAV creation, using an SRT file as a base for translation, document translation, manual speaker editing, and flexible output options (video, audio, subtitles).
+
+ 🔥 2023/10/29: Edit the translated subtitle, download it, and adjust volume and speed options.
+
+ 🔥 2023/08/03: Changed default options and added a directory view of downloads.
+
+ 🔥 2023/08/02: Added support for Arabic, Czech, Danish, Finnish, Greek, Hebrew, Hungarian, Korean, Persian, Polish, Russian, Turkish, Urdu, Hindi, and Vietnamese languages. 🌐
+
+ 🔥 2023/08/01: Added options for using RVC models.
+
+ 🔥 2023/07/27: Fixed some bugs when processing video and audio.
+
+ 🔥 2023/07/26: New UI and added mix options.
+
+ ## Contributing
+
+ Contributions from the community are welcome! If you have any ideas, bug reports, or feature requests, please open an issue or submit a pull request. For more information, please refer to the contribution guidelines.
+
+ ## Credits
+
+ This project leverages a number of open-source projects. We would like to acknowledge and thank the contributors of the following repositories:
+
+ - [PyTorch](https://github.com/pytorch/pytorch)
+ - [yt-dlp](https://github.com/yt-dlp/yt-dlp)
+ - [Gradio](https://github.com/gradio-app/gradio)
+ - [edge-tts](https://github.com/rany2/edge-tts)
+ - [deep-translator](https://github.com/nidhaloff/deep-translator)
+ - [pyannote-audio](https://github.com/pyannote/pyannote-audio)
+ - [WhisperX](https://github.com/m-bain/whisperX)
+ - [faster-whisper](https://github.com/SYSTRAN/faster-whisper)
+ - [CTranslate2](https://github.com/OpenNMT/CTranslate2)
+ - [Transformers](https://github.com/huggingface/transformers)
+ - [FFmpeg](https://github.com/FFmpeg/FFmpeg)
+ - [Piper](https://github.com/rhasspy/piper)
+ - [Coqui TTS](https://github.com/coqui-ai/TTS)
+ - [pypdf](https://github.com/py-pdf/pypdf)
+ - [OpenVoice](https://github.com/myshell-ai/OpenVoice)
+
+ ## License
+ Although the code is licensed under Apache 2, the models or weights may have commercial restrictions, as seen with pyannote diarization.
SoniTranslate_Colab.ipynb CHANGED
@@ -1,27 +1,10 @@
 {
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "provenance": [],
- "gpuType": "T4",
- "include_colab_link": true
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
- },
- "accelerator": "GPU"
- },
 "cells": [
 {
 "cell_type": "markdown",
 "metadata": {
- "id": "view-in-github",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "view-in-github"
 },
 "source": [
 "<a href=\"https://colab.research.google.com/github/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@@ -29,6 +12,9 @@
 },
 {
 "cell_type": "markdown",
+ "metadata": {
+ "id": "8lw0EgLex-YZ"
+ },
 "source": [
 "# SoniTranslate\n",
 "\n",
@@ -38,17 +24,14 @@
 "| 🚀 Online Demo in HF | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content) |\n",
 "\n",
 "\n"
- ],
- "metadata": {
- "id": "8lw0EgLex-YZ"
- }
+ ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
- "id": "LUgwm0rfx0_J",
- "cellView": "form"
+ "cellView": "form",
+ "id": "LUgwm0rfx0_J"
 },
 "outputs": [],
 "source": [
@@ -56,6 +39,8 @@
 "!git clone https://github.com/r3gm/SoniTranslate.git\n",
 "%cd SoniTranslate\n",
 "\n",
+ "!pip uninstall chex pandas-stubs ibis-framework albumentations albucore -y -q\n",
+ "!python -m pip install -q pip==23.1.2\n",
 "!apt install git-lfs\n",
 "!git lfs install\n",
 "\n",
@@ -78,20 +63,25 @@
 },
 {
 "cell_type": "markdown",
+ "metadata": {
+ "id": "LTaTstXPXNg2"
+ },
 "source": [
 "One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation\n",
 "\n",
+ "Get your KEY TOKEN here: https://hf.co/settings/tokens\n",
 "\n",
- "\n",
- "\n",
- "Get your KEY TOKEN here: https://hf.co/settings/tokens"
- ],
- "metadata": {
- "id": "LTaTstXPXNg2"
- }
+ "When you are creating the new Access Token in Hugging Face, make sure to tick \"Read access to contents of all public gated repos you can access\"."
+ ]
 },
 {
 "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "XkhXfaFw4R4J"
+ },
+ "outputs": [],
 "source": [
 "#@markdown # `RUN THE WEB APP`\n",
 "YOUR_HF_TOKEN = \"\" #@param {type:'string'}\n",
@@ -103,22 +93,33 @@
 "\n",
 "%cd /content/SoniTranslate\n",
 "!python app_rvc.py --theme {theme} --verbosity_level {verbosity_level} --language {interface_language} --public_url"
- ],
- "metadata": {
- "id": "XkhXfaFw4R4J",
- "cellView": "form"
- },
- "execution_count": null,
- "outputs": []
+ ]
 },
 {
 "cell_type": "markdown",
- "source": [
- "Open the `public URL` when it appears"
- ],
 "metadata": {
 "id": "KJW3KrhZJh0u"
- }
+ },
+ "source": [
+ "Open the `public URL` when it appears"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "include_colab_link": true,
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
 }
- ]
- }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
SoniTranslate_Colab_embedded.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
assets/Video.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e03c405628162038a10679372d43ca0e38e40507e1e5fce8b6d252ec3ca4dbf8
+ size 1931745
assets/video_dub.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d501c9a646b576d7bc3af0c51900dd6b66992fe82ac90aded1f28091af17ca65
+ size 1813313
docs/windows_install.md CHANGED
@@ -28,7 +28,7 @@ Before you start installing and using SoniTranslate, there are a few things you
 3. If you see that your CUDA version is less than 11.8, you should update your NVIDIA driver. Visit the NVIDIA website's driver download page (https://www.nvidia.com/Download/index.aspx) and enter your graphics card information.

 4. Accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation
- 5. Create a [huggingface token](https://huggingface.co/settings/tokens). Hugging Face is a natural language processing platform that provides access to state-of-the-art models and tools. You will need to create a token in order to use some of the automatic model download features in SoniTranslate. Follow the instructions on the Hugging Face website to create a token.
+ 5. Create a [huggingface token](https://huggingface.co/settings/tokens). Hugging Face is a natural language processing platform that provides access to state-of-the-art models and tools. You will need to create a token in order to use some of the automatic model download features in SoniTranslate. Follow the instructions on the Hugging Face website to create a token. When you are creating the new Access Token in Hugging Face, make sure to tick "Read access to contents of all public gated repos you can access".
 6. Install [Anaconda](https://www.anaconda.com/) or [Miniconda](https://docs.anaconda.com/free/miniconda/miniconda-install/). Anaconda is a free and open-source distribution of Python and R. It includes a package manager called conda that makes it easy to install and manage Python environments and packages. Follow the instructions on the Anaconda website to download and install Anaconda on your system.
 7. Install Git for your system. Git is a version control system that helps you track changes to your code and collaborate with other developers. You can install Git with Anaconda by running `conda install -c anaconda git -y` in your terminal (Do this after step 1 in the following section.). If you have trouble installing Git via Anaconda, you can use the following link instead:
 - [Git for Windows](https://git-scm.com/download/win)
@@ -44,6 +44,7 @@ To install SoniTranslate, follow these steps:
 ```
 conda create -n sonitr python=3.10 -y
 conda activate sonitr
+ python -m pip install pip==23.1.2
 ```

 2. Clone this github repository and navigate to it:
@@ -100,7 +101,6 @@ conda env config vars set YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN_HERE"
 conda deactivate
 ```

-
 ### Running SoniTranslate

 To run SoniTranslate locally, make sure the `sonitr` conda environment is active:
requirements.txt CHANGED
@@ -1,19 +1,37 @@
+ # Temporal requirements
+ nest_asyncio
+ --extra-index-url https://download.pytorch.org/whl/cu118
+ torch>=2.1.0+cu118
+ torchvision>=0.16.0+cu118
+ torchaudio>=2.1.0+cu118
+ yt-dlp
+ gradio==4.19.2
+ pydub==0.25.1
+ edge_tts==6.1.7
+ deep_translator==1.11.4
+ git+https://github.com/m-bain/whisperX.git@a5dca2c
+ gTTS
+ gradio_client==0.10.1
 praat-parselmouth>=0.4.3
 pyworld==0.3.2
 faiss-cpu==1.7.3
 torchcrepe==0.0.20
 ffmpeg-python>=0.2.0
- fairseq==0.12.2
+ git+https://github.com/facebookresearch/fairseq.git@refs/pull/5359/merge
 gdown
 rarfile
+ IPython
 transformers
 accelerate
 optimum
 sentencepiece
 srt
+ onnxruntime-gpu
 git+https://github.com/R3gm/openvoice_package.git@lite
- openai==1.14.3
- tiktoken==0.6.0
 # Documents
- pypdf==4.2.0
- python-docx
+ PyPDF2
+ python-docx
+
+ # after this
+ # pip install git+https://github.com/omry/omegaconf.git@refs/pull/1137/merge
+
requirements_base.txt ADDED
@@ -0,0 +1,15 @@
+ --extra-index-url https://download.pytorch.org/whl/cu118
+ torch>=2.1.0+cu118
+ torchvision>=0.16.0+cu118
+ torchaudio>=2.1.0+cu118
+ yt-dlp
+ gradio==4.19.2
+ pydub==0.25.1
+ edge_tts==6.1.7
+ deep_translator==1.11.4
+ git+https://github.com/R3gm/pyannote-audio.git@3.1.1
+ git+https://github.com/R3gm/whisperX.git@cuda_11_8
+ nest_asyncio
+ gTTS
+ gradio_client==0.10.1
+ IPython
requirements_extra.txt ADDED
@@ -0,0 +1,19 @@
+ praat-parselmouth>=0.4.3
+ pyworld==0.3.2
+ faiss-cpu==1.7.3
+ torchcrepe==0.0.20
+ ffmpeg-python>=0.2.0
+ fairseq==0.12.2
+ gdown
+ rarfile
+ transformers
+ accelerate
+ optimum
+ sentencepiece
+ srt
+ git+https://github.com/R3gm/openvoice_package.git@lite
+ openai==1.14.3
+ tiktoken==0.6.0
+ # Documents
+ pypdf==4.2.0
+ python-docx
soni_translate/languages_gui.py CHANGED
The diff for this file is too large to render. See raw diff
 
soni_translate/mdx_net.py CHANGED
@@ -367,18 +367,6 @@ def run_mdx(
 processor_num = -1
 m_threads = 1

- if os.environ.get("ZERO_GPU") == "TRUE":
- duration = librosa.get_duration(filename=filename)
-
- if duration < 60:
- pass
- elif duration >= 60 and duration <= 900:
- m_threads = 4
- elif duration > 900:
- m_threads = 16
-
- logger.info(f"MDX-NET Threads: {m_threads}, duration {duration}")
-
 model_hash = MDX.get_hash(model_path)
 mp = model_params.get(model_hash)
 model = MDXModel(
soni_translate/preprocessor.py CHANGED
@@ -14,7 +14,7 @@ ERROR_INCORRECT_CODEC_PARAMETERS = [
 TESTED_CODECS = [
 "h264", # mp4
 "h265", # mp4
- "hevc", # test
+ "hevc",
 "vp9", # webm
 "mpeg4", # mp4
 "mpeg2video", # mpg
soni_translate/speech_segmentation.py CHANGED
@@ -14,87 +14,6 @@ from .logging_setup import logger
 from .postprocessor import sanitize_file_name
 from .utils import remove_directory_contents, run_command

- # ZERO GPU CONFIG
- import spaces
- import copy
- import random
- import time
-
- def random_sleep():
- if os.environ.get("ZERO_GPU") == "TRUE":
- print("Random sleep")
- sleep_time = round(random.uniform(7.2, 9.9), 1)
- time.sleep(sleep_time)
-
-
- @spaces.GPU
- def load_and_transcribe_audio(asr_model, audio, compute_type, language, asr_options, batch_size, segment_duration_limit):
- # Load model
- model = whisperx.load_model(
- asr_model,
- os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda",
- compute_type=compute_type,
- language=language,
- asr_options=asr_options,
- )
-
- # Transcribe audio
- result = model.transcribe(
- audio,
- batch_size=batch_size,
- chunk_size=segment_duration_limit,
- print_progress=True,
- )
-
- del model
- gc.collect()
- torch.cuda.empty_cache() # noqa
-
- return result
-
- def load_align_and_align_segments(result, audio, DAMHF):
-
- # Load alignment model
- model_a, metadata = whisperx.load_align_model(
- language_code=result["language"],
- device=os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda",
- model_name=None
- if result["language"] in DAMHF.keys()
- else EXTRA_ALIGN[result["language"]],
- )
-
- # Align segments
- alignment_result = whisperx.align(
- result["segments"],
- model_a,
- metadata,
- audio,
- os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda",
- return_char_alignments=True,
- print_progress=False,
- )
-
- # Clean up
- del model_a
- gc.collect()
- torch.cuda.empty_cache() # noqa
-
- return alignment_result
-
- @spaces.GPU
- def diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers):
-
- if os.environ.get("ZERO_GPU") == "TRUE":
- diarize_model.model.to(torch.device("cuda"))
- diarize_segments = diarize_model(
- audio_wav,
- min_speakers=min_speakers,
- max_speakers=max_speakers
- )
- return diarize_segments
-
- # ZERO GPU CONFIG
-
 ASR_MODEL_OPTIONS = [
 "tiny",
 "base",
@@ -224,6 +143,7 @@ def find_whisper_models():
 folders.append(folder)
 return folders

+
 def transcribe_speech(
 audio_wav,
 asr_model,
@@ -308,17 +228,29 @@ def transcribe_speech(
 asr_model = model_dir
 logger.info(f"ASR Model: {str(model_dir)}")

+ model = whisperx.load_model(
+ asr_model,
+ os.environ.get("SONITR_DEVICE"),
+ compute_type=compute_type,
+ language=SOURCE_LANGUAGE,
+ asr_options=asr_options,
+ )
+
 audio = whisperx.load_audio(audio_wav)
-
- result = load_and_transcribe_audio(
- asr_model, audio, compute_type, SOURCE_LANGUAGE, asr_options, batch_size, segment_duration_limit
+ result = model.transcribe(
+ audio,
+ batch_size=batch_size,
+ chunk_size=segment_duration_limit,
+ print_progress=True,
 )

 if result["language"] == "zh" and not prompt:
 result["language"] = "zh-TW"
 logger.info("Chinese - Traditional (zh-TW)")

-
+ del model
+ gc.collect()
+ torch.cuda.empty_cache() # noqa
 return audio, result


@@ -369,9 +301,25 @@ def align_speech(audio, result):
 )
 return result

- # random_sleep()
- result = load_align_and_align_segments(result, audio, DAMHF)
-
+ model_a, metadata = whisperx.load_align_model(
+ language_code=result["language"],
+ device=os.environ.get("SONITR_DEVICE"),
+ model_name=None
+ if result["language"] in DAMHF.keys()
+ else EXTRA_ALIGN[result["language"]],
+ )
+ result = whisperx.align(
+ result["segments"],
+ model_a,
+ metadata,
+ audio,
+ os.environ.get("SONITR_DEVICE"),
+ return_char_alignments=True,
+ print_progress=False,
+ )
+ del model_a
+ gc.collect()
+ torch.cuda.empty_cache() # noqa
 return result


@@ -471,9 +419,9 @@ def diarize_speech(
 )
 else:
 raise error
-
- random_sleep()
- diarize_segments = diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers)
+ diarize_segments = diarize_model(
+ audio_wav, min_speakers=min_speakers, max_speakers=max_speakers
+ )

 result_diarize = whisperx.assign_word_speakers(
 diarize_segments, result
soni_translate/text_multiformat_processor.py CHANGED
@@ -357,7 +357,7 @@ def add_border_to_image(

 bordered_img = ImageOps.expand(resized_img, padding, fill=border_color)

- bordered_img.save(image_path)
+ bordered_img.save(image_path, format='PNG')

 return image_path

@@ -506,6 +506,8 @@ def doc_to_txtximg_pages(
 images = []
 for image_file_object in page.images:
 img_name = f"{images_folder}{i:04d}_{count:02d}_{image_file_object.name}"
+ if not img_name.lower().endswith('.png'):
+ img_name = os.path.splitext(img_name)[0] + '.png'
 images.append(img_name)
 with open(img_name, "wb") as fp:
 fp.write(image_file_object.data)
soni_translate/text_to_speech.py CHANGED
@@ -15,6 +15,7 @@ from .utils import (
 remove_directory_contents,
 remove_files,
 run_command,
+ write_chunked,
 )
 import numpy as np
 from typing import Any, Dict
@@ -59,7 +60,7 @@ def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename):
 # Read audio data from the TemporaryFile using soundfile
 audio_data, samplerate = sf.read(f)
 f.close() # Close the TemporaryFile
- sf.write(
+ write_chunked(
 filename, audio_data, samplerate, format="ogg", subtype="vorbis"
 )

@@ -73,7 +74,7 @@ def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename):
 sample_rate_aux = 22050
 duration = float(segment["end"]) - float(segment["start"])
 data = np.zeros(int(sample_rate_aux * duration)).astype(np.float32)
- sf.write(
+ write_chunked(
 filename, data, sample_rate_aux, format="ogg", subtype="vorbis"
 )
 logger.error("Audio will be replaced -> [silent audio].")
@@ -181,7 +182,7 @@ def segments_egde_tts(filtered_edge_segments, TRANSLATE_AUDIO_TO, is_gui):
 # os.remove(temp_file)

 # Save file
- sf.write(
+ write_chunked(
 file=filename,
 samplerate=sample_rate,
 data=data,
@@ -256,7 +257,7 @@ def segments_bark_tts(
 speech_output.cpu().numpy().squeeze().astype(np.float32),
 sampling_rate,
 )
- sf.write(
+ write_chunked(
 file=filename,
 samplerate=sampling_rate,
 data=data_tts,
@@ -299,7 +300,7 @@ def uromanize(input_string):
 stderr=subprocess.PIPE,
 )
 stdout, stderr = process.communicate()
- script_path = os.path.join("./uroman", "bin", "uroman.pl")
+ script_path = os.path.join("./uroman", "uroman", "uroman.pl")

 command = ["perl", script_path]

@@ -362,7 +363,7 @@ def segments_vits_tts(filtered_vits_segments, TRANSLATE_AUDIO_TO):
 sampling_rate,
 )
 # Save file
- sf.write(
+ write_chunked(
 file=filename,
 samplerate=sampling_rate,
 data=data_tts,
@@ -667,7 +668,7 @@ def segments_coqui_tts(
 sampling_rate,
 )
 # Save file
- sf.write(
+ write_chunked(
 file=filename,
 samplerate=sampling_rate,
 data=data_tts,
@@ -855,7 +856,7 @@ def segments_vits_onnx_tts(filtered_onnx_vits_segments, TRANSLATE_AUDIO_TO):
 sampling_rate,
 )
 # Save file
- sf.write(
+ write_chunked(
 file=filename,
 samplerate=sampling_rate,
 data=data_tts,
@@ -925,7 +926,7 @@ def segments_openai_tts(
 sampling_rate,
 )

- sf.write(
+ write_chunked(
 file=filename,
 samplerate=sampling_rate,
 data=data_tts,
@@ -1509,7 +1510,7 @@ def toneconverter_freevc(
 target_wav=original_wav_audio_segment,
 )

- sf.write(
+ write_chunked(
 file=save_path,
 samplerate=tts.voice_converter.vc_config.audio.output_sample_rate,
 data=wav,
soni_translate/utils.py CHANGED
@@ -3,6 +3,8 @@ from .logging_setup import logger
 from urllib.parse import urlparse
 from IPython.utils import capture
 import re
+ import soundfile as sf
+ import numpy as np

 VIDEO_EXTENSIONS = [
 ".mp4",
@@ -66,6 +68,31 @@ def run_command(command):
 raise Exception(errors.decode())


+ def write_chunked(
+ file,
+ data,
+ samplerate,
+ subtype=None,
+ endian=None,
+ format=None,
+ closefd=True,
+ chunk_size=0x1000
+ ):
+
+ data = np.asarray(data)
+ if data.ndim == 1:
+ channels = 1
+ else:
+ channels = data.shape[1]
+ with sf.SoundFile(
+ file, 'w', samplerate, channels,
+ subtype, endian, format, closefd
+ ) as f:
+ num_chunks = (len(data) + chunk_size - 1) // chunk_size
+ for chunk in np.array_split(data, num_chunks, axis=0):
+ f.write(chunk)
+
+
 def print_tree_directory(root_dir, indent=""):
 if not os.path.exists(root_dir):
 logger.error(f"{indent} Invalid directory or file: {root_dir}")
@@ -143,10 +170,6 @@ def manual_download(url, dst):


 def download_list(text_downloads):
-
- if os.environ.get("ZERO_GPU") == "TRUE":
- raise RuntimeError("This option is disabled in this demo.")
-
 try:
 urls = [elem.strip() for elem in text_downloads.split(",")]
 except Exception as error:
voice_main.py CHANGED
@@ -17,7 +17,11 @@ from lib.audio import load_audio
 import soundfile as sf
 import edge_tts
 import asyncio
- from soni_translate.utils import remove_directory_contents, create_directories
+ from soni_translate.utils import (
+ remove_directory_contents,
+ create_directories,
+ write_chunked,
+ )
 from scipy import signal
 from time import time as ttime
 import faiss
@@ -437,10 +441,12 @@ class ClassVoices:
 output_audio_path = new_path

 # Save file
- sf.write(
+ write_chunked(
 file=output_audio_path,
 samplerate=final_sr,
- data=audio_opt
+ data=audio_opt,
+ format="ogg",
+ subtype="vorbis",
 )

 self.model_config[task_id]["result"].append(output_audio_path)