Elohe_video-dubb_tool

Configuration error

App Files Files Community

RO-Rtechs committed on Jun 19

Commit

f279f97

•

1 Parent(s): cd7a065

Upload 22 files

Browse files

Files changed (21) hide show

LICENSE +201 -201
README.md +345 -16
app_rvc.py +0 -0
mdx_models/data.json +353 -353
requirements.txt +37 -19
requirements_base.txt +15 -0
requirements_extra.txt +19 -0
requirements_xtts.txt +57 -57
soni_translate/audio_segments.py +141 -141
soni_translate/language_configuration.py +551 -551
soni_translate/languages_gui.py +1 -1
soni_translate/logging_setup.py +68 -68
soni_translate/mdx_net.py +582 -594
soni_translate/postprocessor.py +231 -231
soni_translate/preprocessor.py +309 -309
soni_translate/speech_segmentation.py +447 -499
soni_translate/text_multiformat_processor.py +987 -987
soni_translate/text_to_speech.py +0 -0
soni_translate/translate_segments.py +457 -457
soni_translate/utils.py +483 -487
voice_main.py +732 -732

LICENSE CHANGED Viewed

@@ -1,201 +1,201 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-   1. Definitions.
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-   END OF TERMS AND CONDITIONS
-   APPENDIX: How to apply the Apache License to your work.
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-   Copyright [yyyy] [name of copyright owner]
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-       http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,16 +1,345 @@
----
-title: Video Dubbing (SoniTranslate)
-emoji: 🌍
-colorFrom: blue
-colorTo: green
-sdk: gradio
-sdk_version: 4.31.3
-app_file: app_rvc.py
-pinned: true
-license: mit
-short_description: Video Dubbing with Open Source Projects
-preload_from_hub:
-  - Systran/faster-whisper-large-v3
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# 🎥 SoniTranslate 🈷️
+🎬 Video Translation with Synchronized Audio 🌐
+SoniTranslate is a powerful and user-friendly web application that allows you to easily translate videos into different languages. This repository hosts the code for the SoniTranslate web UI, which is built with the Gradio library to provide a seamless and interactive user experience.
+| Description | Link |
+| ----------- | ---- |
+| 📙 Colab Notebook | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb) |
+| 🎉 Repository | [![GitHub Repository](https://img.shields.io/badge/GitHub-Repository-black?style=flat-square&logo=github)](https://github.com/R3gm/SoniTranslate/) |
+| 🚀 Online DEMO | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content) |
+## SoniTranslate's web UI, which features a browser interface built on the Gradio library.
+![image](https://github.com/R3gm/SoniTranslate/assets/114810545/0d71fbf4-e9f0-4f8f-944e-8f3f1ea6a019)
+## Using the project: A video guide
+For a comprehensive understanding of the project, we highly recommend watching this video tutorial by [DEV-MalletteS](https://github.com/DEV-MalletteS). You can watch it on YouTube by clicking the thumbnail below:
+[![Watch the video](https://img.youtube.com/vi/SmGkFaSzq_Q/0.jpg)](https://www.youtube.com/watch?v=SmGkFaSzq_Q)
+## Supported languages for translation
+| Language Code | Language   |
+|---------------|------------|
+| en            | English    |
+| fr            | French     |
+| de            | German     |
+| es            | Spanish    |
+| it            | Italian    |
+| ja            | Japanese   |
+| nl            | Dutch      |
+| uk            | Ukrainian  |
+| pt            | Portuguese |
+| ar            | Arabic     |
+| zh            | Chinese - Simplified      |
+| zh-TW         | Chinese - Traditional     |
+| cs            | Czech      |
+| da            | Danish     |
+| fi            | Finnish    |
+| el            | Greek      |
+| he            | Hebrew     |
+| hu            | Hungarian  |
+| ko            | Korean     |
+| fa            | Persian    |
+| pl            | Polish     |
+| ru            | Russian    |
+| tr            | Turkish    |
+| ur            | Urdu       |
+| hi            | Hindi      |
+| vi            | Vietnamese |
+| id            | Indonesian |
+| bn            | Bengali    |
+| te            | Telugu     |
+| mr            | Marathi    |
+| ta            | Tamil      |
+| jw (or jv)    | Javanese   |
+| ca            | Catalan    |
+| ne            | Nepali     |
+| th            | Thai       |
+| sv            | Swedish    |
+| am            | Amharic    |
+| cy            | Welsh      |
+| hr            | Croatian   |
+| is            | Icelandic  |
+| ka            | Georgian   |
+| km            | Khmer      |
+| sk            | Slovak     |
+| sq            | Albanian   |
+| sr            | Serbian    |
+| az            | Azerbaijani|
+| bg            | Bulgarian  |
+| gl            | Galician   |
+| gu            | Gujarati   |
+| kk            | Kazakh     |
+| kn            | Kannada    |
+| lt            | Lithuanian |
+| lv            | Latvian    |
+| ml            | Malayalam  |
+| ro            | Romanian   |
+| si            | Sinhala    |
+| su            | Sundanese  |
+| et            | Estonian                  |
+| mk            | Macedonian                |
+| sw            | Swahili                   |
+| af            | Afrikaans                 |
+| bs            | Bosnian                   |
+| la            | Latin                     |
+| my            | Myanmar Burmese           |
+| no            | Norwegian                 |
+| as            | Assamese                  |
+| eu            | Basque                    |
+| ha            | Hausa                     |
+| ht            | Haitian Creole            |
+| hy            | Armenian                  |
+| lo            | Lao                       |
+| mg            | Malagasy                  |
+| mn            | Mongolian                 |
+| mt            | Maltese                   |
+| pa            | Punjabi                   |
+| ps            | Pashto                    |
+| sl            | Slovenian                 |
+| sn            | Shona                     |
+| so            | Somali                    |
+| tg            | Tajik                     |
+| tk            | Turkmen                   |
+| tt            | Tatar                     |
+| uz            | Uzbek                     |
+| yo            | Yoruba                    |
+### Non-transcription
+| Language Code | Language   |
+|---------------|------------|
+| ay | Aymara |
+| bm | Bambara |
+| ceb | Cebuano |
+| ny | Chichewa |
+| dv | Divehi |
+| doi | Dogri |
+| ee | Ewe |
+| gn | Guarani |
+| ilo | Iloko |
+| rw | Kinyarwanda|
+| kri | Krio |
+| ku | Kurdish |
+| ky | Kirghiz |
+| lg | Ganda |
+| mai | Maithili |
+| or | Oriya |
+| om | Oromo |
+| qu | Quechua |
+| sm | Samoan |
+| ti | Tigrinya |
+| ts | Tsonga |
+| ak | Akan |
+| ug | Uighur |
+## Example:
+### Original audio
+https://github.com/R3gm/SoniTranslate/assets/114810545/db9e78c0-b228-4e81-9704-e62d5cc407a3
+### Translated audio
+https://github.com/R3gm/SoniTranslate/assets/114810545/6a8ddc65-a46f-4653-9726-6df2615f0ef9
+## Colab Runtime
+To run SoniTranslate using Colab Runtime: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)
+## Install Locally (Installation tested in Linux)
+### Before You Start
+Before you start installing and using SoniTranslate, there are a few things you need to do:
+1. Install the NVIDIA drivers for CUDA 11.8.0. NVIDIA CUDA is a parallel computing platform and programming model that enables developers to use the power of NVIDIA graphics processing units (GPUs) to speed up compute-intensive tasks. You can find the drivers [here](https://developer.nvidia.com/cuda-toolkit-archive). Follow the instructions on the website to download and install the drivers.
+2. Accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation
+3. Create a [huggingface token](https://huggingface.co/settings/tokens). Hugging Face is a natural language processing platform that provides access to state-of-the-art models and tools. You will need to create a token in order to use some of the automatic model download features in SoniTranslate. Follow the instructions on the Hugging Face website to create a token.
+4. Install [Anaconda](https://www.anaconda.com/) or [Miniconda](https://docs.anaconda.com/free/miniconda/miniconda-install/). Anaconda is a free and open-source distribution of Python and R. It includes a package manager called conda that makes it easy to install and manage Python environments and packages. Follow the instructions on the Anaconda website to download and install Anaconda on your system.
+5. Install Git for your system. Git is a version control system that helps you track changes to your code and collaborate with other developers. You can install Git with Anaconda by running `conda install -c anaconda git -y` in your terminal (do this after step 1 in the following section). If you have trouble installing Git via Anaconda, you can use the following link instead:
+   - [Git for Linux](https://git-scm.com/download/linux)
+Once you have completed these steps, you will be ready to install SoniTranslate.
+### Getting Started
+To install SoniTranslate, follow these steps:
+1. Create a suitable anaconda environment for SoniTranslate and activate it:
+```
+conda create -n sonitr python=3.10 -y
+conda activate sonitr
+conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
+```
+2. Clone this github repository and navigate to it:
+```
+git clone https://github.com/r3gm/SoniTranslate.git
+cd SoniTranslate
+```
+3. Install required packages:
+```
+pip install -r requirements_base.txt -v
+pip install -r requirements_extra.txt -v
+pip install onnxruntime-gpu
+```
+4. Install [ffmpeg](https://ffmpeg.org/download.html). FFmpeg is a free software project that produces libraries and programs for handling multimedia data. You will need it to process audio and video files. You can install ffmpeg with Anaconda by running `conda install -y ffmpeg` in your terminal (recommended). If you have trouble installing ffmpeg via Anaconda, you can use the following link instead: (https://ffmpeg.org/ffmpeg.html). Once it is installed, make sure it is in your PATH by running `ffmpeg -h` in your terminal. If you don't get an error message, you're good to go.
+5. Optional install:
+After installing FFmpeg, you can install these optional packages.
+[Piper TTS](https://github.com/rhasspy/piper) is a fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4. Piper is used in a variety of projects. Voices are trained with VITS and exported to the onnxruntime.
+```
+pip install -q piper-tts==1.2.0
+```
+[Coqui XTTS](https://github.com/coqui-ai/TTS) is a text-to-speech (TTS) model that lets you generate realistic voices in different languages. It can clone voices with just a short audio clip, even speak in a different language! It's like having a personal voice mimic for any text you need spoken.
+```
+pip install -q -r requirements_xtts.txt
+pip install -q TTS==0.21.1  --no-deps
+```
+### Running SoniTranslate
+To run SoniTranslate locally, make sure the `sonitr` conda environment is active:
+```
+conda activate sonitr
+```
+Set your Hugging Face token as an environment variable in Linux:
+```
+export YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN"
+```
+Then navigate to the `SoniTranslate` folder and run `app_rvc.py`:
+```
+python app_rvc.py
+```
+When the `local URL` `http://127.0.0.1:7860` is displayed in the terminal, simply open this URL in your web browser to access the SoniTranslate interface.
+### Stop and close SoniTranslate.
+In most environments, you can stop the execution by pressing Ctrl+C in the terminal where you launched the script `app_rvc.py`. This will interrupt the program and stop the Gradio app.
+To deactivate the Conda environment, you can use the following command:
+```
+conda deactivate
+```
+This will deactivate the currently active Conda environment sonitr, and you'll return to the base environment or the global Python environment.
+### Starting Over
+If you need to start over from scratch, you can delete the `SoniTranslate` folder and remove the `sonitr` conda environment with the following set of commands:
+```
+conda deactivate
+conda env remove -n sonitr
+```
+With the `sonitr` environment removed, you can start over with a fresh installation.
+## Command line arguments
+The app_rvc.py script supports command-line arguments to customize its behavior. Here's a brief guide on how to use them:
+| Argument command | Default | Value | Description |
+|------------------|---------|-------|-------------|
+| --theme          | Taithrah/Minimal | String | Sets the theme for the interface. Themes can be found in the [Theme Gallery](https://huggingface.co/spaces/gradio/theme-gallery). |
+| --language       | english | String | Selects the interface language. Available options: afrikaans, arabic, azerbaijani, chinese_zh_cn, english, french, german, hindi, indonesian, italian, japanese, korean, marathi, persian, polish, portuguese, russian, spanish, swedish, turkish, ukrainian, vietnamese. |
+| --verbosity_level| info    | String | Sets the verbosity level of the logger: debug, info, warning, error, or critical. |
+| --public_url     |    | Boolean | Enables a public link. |
+| --cpu_mode     |    | Boolean | Enable CPU mode to run the program without utilizing GPU acceleration. |
+| --logs_in_gui    |    | Boolean | Shows the operations performed in Logs (obsolete). |
+Example usage:
+```
+python app_rvc.py --theme aliabid94/new-theme --language french
+```
+This command sets the theme to a custom theme and selects French as the interface language.
+Feel free to customize these arguments according to your preferences and requirements.
+## 📖 News
+🔥 2024/05/18: New Update Details
+   - Added option Overlap Reduction
+   - OpenAI API Key Integration for Transcription, translation, and TTS
+   - More output types: subtitles by speaker, separate audio sound, and video only with subtitles
+   - Access to a better-performing version of Whisper for transcribing speech on the [Hugging Face Whisper page](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending&search=whisper). Copy the repository ID and paste it into the 'Whisper ASR model' section in 'Advanced Settings'; e.g., `kotoba-tech/kotoba-whisper-v1.1` for Japanese transcription [available here](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1)
+   - Support for ASS subtitles and batch processing with subtitles
+   - Vocal enhancement before transcription
+   - Added CPU mode with `app_rvc.py --cpu_mode`
+   - TTS now supports up to 12 speakers
+   - OpenVoiceV2 integration for voice imitation
+   - PDF to videobook (displays images from the PDF)
+   - GUI language translation in Persian and Afrikaans
+   - **New Language Support**:
+     - **Complete support**: Estonian, Macedonian, Malay, Swahili, Afrikaans, Bosnian, Latin, Myanmar Burmese, Norwegian, Traditional Chinese, Assamese, Basque, Hausa, Haitian Creole, Armenian, Lao, Malagasy, Mongolian, Maltese, Punjabi, Pashto, Slovenian, Shona, Somali, Tajik, Turkmen, Tatar, Uzbek, and Yoruba
+     - **Non-transcription**: Aymara, Bambara, Cebuano, Chichewa, Divehi, Dogri, Ewe, Guarani, Iloko, Kinyarwanda, Krio, Kurdish, Kirghiz, Ganda, Maithili, Oriya, Oromo, Quechua, Samoan, Tigrinya, Tsonga, Akan, and Uighur
+🔥 2024/03/02: Preserve file names in output. Multiple archives can now be submitted simultaneously by specifying their paths, directories or URLs separated by commas. Processing of a full YouTube playlist. About [supported sites URL](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md), please be aware that not all sites may work optimally. Added option for disabling diarization. Implemented soft subtitles. Format output (MP3, MP4, MKV, WAV, and OGG), and resolved issues related to file reading and diarization.
+🔥 2024/02/22: Added freevc for voice imitation, fixed voiceless track, divide segments. New languages support (Swedish, Amharic, Welsh, Croatian, Icelandic, Georgian, Khmer, Slovak, Albanian, Serbian, Azerbaijani, Bulgarian, Galician, Gujarati, Kazakh, Kannada, Lithuanian, Latvian, Malayalam, Romanian, Sinhala and Sundanese). New translations of the GUI (Spanish, French, German, Italian, Japanese, Chinese Simplified, Ukrainian, Arabic, Russian, Turkish, Indonesian, Portuguese, Hindi, Vietnamese, Polish, Swedish, Korean, Marathi and Azerbaijani). With subtitle file, no align and the media file is not needed to process the SRT file. Burn subtitles to video. Queue can accept multiple tasks simultaneously. Sound alert notification. Continue process from last checkpoint. Acceleration rate regulation.
+🔥 2024/01/16: Expanded language support (Thai, Nepali, Catalan, Javanese, Tamil, Marathi, Telugu, Bengali and Indonesian), the introduction of whisper large v3, configurable GUI options, integration of BARK, Facebook-mms, Coqui XTTS, and Piper-TTS. Additional features included audio separation utilities, XTTS WAV creation, use an SRT file as a base for translation, document translation, manual speaker editing, and flexible output options (video, audio, subtitles).
+🔥 2023/10/29: Edit the translated subtitle, download it, adjust volume and speed options.
+🔥 2023/08/03: Changed default options and added directory view of downloads.
+🔥 2023/08/02: Added support for Arabic, Czech, Danish, Finnish, Greek, Hebrew, Hungarian, Korean, Persian, Polish, Russian, Turkish, Urdu, Hindi, and Vietnamese languages. 🌐
+🔥 2023/08/01: Added options for using RVC models.
+🔥 2023/07/27: Fixed some bugs in video and audio processing.
+🔥 2023/07/26: New UI and added mix options.
+## Contributing
+Contributions from the community are welcome! If you have any ideas, bug reports, or feature requests, please open an issue or submit a pull request. For more information, please refer to the contribution guidelines.
+## Credits
+This project leverages a number of open-source projects. We would like to acknowledge and thank the contributors of the following repositories:
+- [PyTorch](https://github.com/pytorch/pytorch)
+- [yt-dlp](https://github.com/yt-dlp/yt-dlp)
+- [Gradio](https://github.com/gradio-app/gradio)
+- [edge-tts](https://github.com/rany2/edge-tts)
+- [deep-translator](https://github.com/nidhaloff/deep-translator)
+- [pyannote-audio](https://github.com/pyannote/pyannote-audio)
+- [WhisperX](https://github.com/m-bain/whisperX)
+- [faster-whisper](https://github.com/SYSTRAN/faster-whisper)
+- [CTranslate2](https://github.com/OpenNMT/CTranslate2)
+- [Transformers](https://github.com/huggingface/transformers)
+- [FFmpeg](https://github.com/FFmpeg/FFmpeg)
+- [Piper](https://github.com/rhasspy/piper)
+- [Coqui TTS](https://github.com/coqui-ai/TTS)
+- [pypdf](https://github.com/py-pdf/pypdf)
+- [OpenVoice](https://github.com/myshell-ai/OpenVoice)
+## License
+Although the code is licensed under Apache 2, the models or weights may have commercial restrictions, as seen with pyannote diarization.

app_rvc.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

mdx_models/data.json CHANGED Viewed

@@ -1,354 +1,354 @@
-{
-    "0ddfc0eb5792638ad5dc27850236c246": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Vocals"
-    },
-    "26d308f91f3423a67dc69a6d12a8793d": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 9,
-        "mdx_n_fft_scale_set": 8192,
-        "primary_stem": "Other"
-    },
-    "2cdd429caac38f0194b133884160f2c6": {
-        "compensate": 1.045,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Instrumental"
-    },
-    "2f5501189a2f6db6349916fabe8c90de": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Vocals"
-    },
-    "398580b6d5d973af3120df54cee6759d": {
-        "compensate": 1.75,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Vocals"
-    },
-    "488b3e6f8bd3717d9d7c428476be2d75": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Instrumental"
-    },
-    "4910e7827f335048bdac11fa967772f9": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 7,
-        "mdx_n_fft_scale_set": 4096,
-        "primary_stem": "Drums"
-    },
-    "53c4baf4d12c3e6c3831bb8f5b532b93": {
-        "compensate": 1.043,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Vocals"
-    },
-    "5d343409ef0df48c7d78cce9f0106781": {
-        "compensate": 1.075,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Vocals"
-    },
-    "5f6483271e1efb9bfb59e4a3e6d4d098": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 9,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Vocals"
-    },
-    "65ab5919372a128e4167f5e01a8fda85": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 8192,
-        "primary_stem": "Other"
-    },
-    "6703e39f36f18aa7855ee1047765621d": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 9,
-        "mdx_n_fft_scale_set": 16384,
-        "primary_stem": "Bass"
-    },
-    "6b31de20e84392859a3d09d43f089515": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Vocals"
-    },
-    "867595e9de46f6ab699008295df62798": {
-        "compensate": 1.03,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Vocals"
-    },
-    "a3cd63058945e777505c01d2507daf37": {
-        "compensate": 1.03,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Vocals"
-    },
-    "b33d9b3950b6cbf5fe90a32608924700": {
-        "compensate": 1.03,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Vocals"
-    },
-    "c3b29bdce8c4fa17ec609e16220330ab": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 16384,
-        "primary_stem": "Bass"
-    },
-    "ceed671467c1f64ebdfac8a2490d0d52": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Instrumental"
-    },
-    "d2a1376f310e4f7fa37fb9b5774eb701": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Instrumental"
-    },
-    "d7bff498db9324db933d913388cba6be": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Vocals"
-    },
-    "d94058f8c7f1fae4164868ae8ae66b20": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Vocals"
-    },
-    "dc41ede5961d50f277eb846db17f5319": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 9,
-        "mdx_n_fft_scale_set": 4096,
-        "primary_stem": "Drums"
-    },
-    "e5572e58abf111f80d8241d2e44e7fa4": {
-        "compensate": 1.028,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Instrumental"
-    },
-    "e7324c873b1f615c35c1967f912db92a": {
-        "compensate": 1.03,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Vocals"
-    },
-    "1c56ec0224f1d559c42fd6fd2a67b154": {
-        "compensate": 1.025,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 5120,
-        "primary_stem": "Instrumental"
-    },
-    "f2df6d6863d8f435436d8b561594ff49": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Instrumental"
-    },
-    "b06327a00d5e5fbc7d96e1781bbdb596": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Instrumental"
-    },
-    "94ff780b977d3ca07c7a343dab2e25dd": {
-        "compensate": 1.039,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Instrumental"
-    },
-    "73492b58195c3b52d34590d5474452f6": {
-        "compensate": 1.043,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Vocals"
-    },
-    "970b3f9492014d18fefeedfe4773cb42": {
-        "compensate": 1.009,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Vocals"
-    },
-    "1d64a6d2c30f709b8c9b4ce1366d96ee": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 5120,
-        "primary_stem": "Instrumental"
-    },
-    "203f2a3955221b64df85a41af87cf8f0": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Instrumental"
-    },
-    "291c2049608edb52648b96e27eb80e95": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Instrumental"
-    },
-    "ead8d05dab12ec571d67549b3aab03fc": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Instrumental"
-    },
-    "cc63408db3d80b4d85b0287d1d7c9632": {
-        "compensate": 1.033,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Instrumental"
-    },
-    "cd5b2989ad863f116c855db1dfe24e39": {
-        "compensate": 1.035,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 9,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Other"
-    },
-    "55657dd70583b0fedfba5f67df11d711": {
-        "compensate": 1.022,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 6144,
-        "primary_stem": "Instrumental"
-    },
-    "b6bccda408a436db8500083ef3491e8b": {
-        "compensate": 1.02,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Instrumental"
-    },
-    "8a88db95c7fb5dbe6a095ff2ffb428b1": {
-        "compensate": 1.026,
-        "mdx_dim_f_set": 2048,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 5120,
-        "primary_stem": "Instrumental"
-    },
-    "b78da4afc6512f98e4756f5977f5c6b9": {
-        "compensate": 1.021,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Instrumental"
-    },
-    "77d07b2667ddf05b9e3175941b4454a0": {
-        "compensate": 1.021,
-        "mdx_dim_f_set": 3072,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 7680,
-        "primary_stem": "Vocals"
-    },
-    "0f2a6bc5b49d87d64728ee40e23bceb1": {
-        "compensate": 1.019,
-        "mdx_dim_f_set": 2560,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 5120,
-        "primary_stem": "Instrumental"
-    },
-    "b02be2d198d4968a121030cf8950b492": {
-        "compensate": 1.020,
-        "mdx_dim_f_set": 2560,
-        "mdx_dim_t_set": 8,
-        "mdx_n_fft_scale_set": 5120,
-        "primary_stem": "No Crowd"
-    },
-    "2154254ee89b2945b97a7efed6e88820": {
-        "config_yaml": "model_2_stem_061321.yaml"
-    },
-    "063aadd735d58150722926dcbf5852a9": {
-        "config_yaml": "model_2_stem_061321.yaml"
-    },
-    "fe96801369f6a148df2720f5ced88c19": {
-        "config_yaml": "model3.yaml"
-    },
-    "02e8b226f85fb566e5db894b9931c640": {
-        "config_yaml": "model2.yaml"
-    },
-    "e3de6d861635ab9c1d766149edd680d6": {
-        "config_yaml": "model1.yaml"
-    },
-    "3f2936c554ab73ce2e396d54636bd373": {
-        "config_yaml": "modelB.yaml"
-    },
-    "890d0f6f82d7574bca741a9e8bcb8168": {
-        "config_yaml": "modelB.yaml"
-    },
-    "63a3cb8c37c474681049be4ad1ba8815": {
-        "config_yaml": "modelB.yaml"
-    },
-    "a7fc5d719743c7fd6b61bd2b4d48b9f0": {
-        "config_yaml": "modelA.yaml"
-    },
-    "3567f3dee6e77bf366fcb1c7b8bc3745": {
-        "config_yaml": "modelA.yaml"
-    },
-    "a28f4d717bd0d34cd2ff7a3b0a3d065e": {
-        "config_yaml": "modelA.yaml"
-    },
-    "c9971a18da20911822593dc81caa8be9": {
-        "config_yaml": "sndfx.yaml"
-    },
-    "57d94d5ed705460d21c75a5ac829a605": {
-        "config_yaml": "sndfx.yaml"
-    },
-    "e7a25f8764f25a52c1b96c4946e66ba2": {
-        "config_yaml": "sndfx.yaml"
-    },
-    "104081d24e37217086ce5fde09147ee1": {
-        "config_yaml": "model_2_stem_061321.yaml"
-    },
-    "1e6165b601539f38d0a9330f3facffeb": {
-        "config_yaml": "model_2_stem_061321.yaml"
-    },
-    "fe0108464ce0d8271be5ab810891bd7c": {
-        "config_yaml": "model_2_stem_full_band.yaml"
-    }
 }

+{
+    "0ddfc0eb5792638ad5dc27850236c246": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "26d308f91f3423a67dc69a6d12a8793d": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 8192,
+        "primary_stem": "Other"
+    },
+    "2cdd429caac38f0194b133884160f2c6": {
+        "compensate": 1.045,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "2f5501189a2f6db6349916fabe8c90de": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "398580b6d5d973af3120df54cee6759d": {
+        "compensate": 1.75,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "488b3e6f8bd3717d9d7c428476be2d75": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "4910e7827f335048bdac11fa967772f9": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 7,
+        "mdx_n_fft_scale_set": 4096,
+        "primary_stem": "Drums"
+    },
+    "53c4baf4d12c3e6c3831bb8f5b532b93": {
+        "compensate": 1.043,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "5d343409ef0df48c7d78cce9f0106781": {
+        "compensate": 1.075,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "5f6483271e1efb9bfb59e4a3e6d4d098": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "65ab5919372a128e4167f5e01a8fda85": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 8192,
+        "primary_stem": "Other"
+    },
+    "6703e39f36f18aa7855ee1047765621d": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 16384,
+        "primary_stem": "Bass"
+    },
+    "6b31de20e84392859a3d09d43f089515": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "867595e9de46f6ab699008295df62798": {
+        "compensate": 1.03,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "a3cd63058945e777505c01d2507daf37": {
+        "compensate": 1.03,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "b33d9b3950b6cbf5fe90a32608924700": {
+        "compensate": 1.03,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "c3b29bdce8c4fa17ec609e16220330ab": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 16384,
+        "primary_stem": "Bass"
+    },
+    "ceed671467c1f64ebdfac8a2490d0d52": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "d2a1376f310e4f7fa37fb9b5774eb701": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "d7bff498db9324db933d913388cba6be": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "d94058f8c7f1fae4164868ae8ae66b20": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "dc41ede5961d50f277eb846db17f5319": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 4096,
+        "primary_stem": "Drums"
+    },
+    "e5572e58abf111f80d8241d2e44e7fa4": {
+        "compensate": 1.028,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "e7324c873b1f615c35c1967f912db92a": {
+        "compensate": 1.03,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "1c56ec0224f1d559c42fd6fd2a67b154": {
+        "compensate": 1.025,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 5120,
+        "primary_stem": "Instrumental"
+    },
+    "f2df6d6863d8f435436d8b561594ff49": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "b06327a00d5e5fbc7d96e1781bbdb596": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "94ff780b977d3ca07c7a343dab2e25dd": {
+        "compensate": 1.039,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "73492b58195c3b52d34590d5474452f6": {
+        "compensate": 1.043,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "970b3f9492014d18fefeedfe4773cb42": {
+        "compensate": 1.009,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "1d64a6d2c30f709b8c9b4ce1366d96ee": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 5120,
+        "primary_stem": "Instrumental"
+    },
+    "203f2a3955221b64df85a41af87cf8f0": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "291c2049608edb52648b96e27eb80e95": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "ead8d05dab12ec571d67549b3aab03fc": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "cc63408db3d80b4d85b0287d1d7c9632": {
+        "compensate": 1.033,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "cd5b2989ad863f116c855db1dfe24e39": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Other"
+    },
+    "55657dd70583b0fedfba5f67df11d711": {
+        "compensate": 1.022,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "b6bccda408a436db8500083ef3491e8b": {
+        "compensate": 1.02,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "8a88db95c7fb5dbe6a095ff2ffb428b1": {
+        "compensate": 1.026,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 5120,
+        "primary_stem": "Instrumental"
+    },
+    "b78da4afc6512f98e4756f5977f5c6b9": {
+        "compensate": 1.021,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "77d07b2667ddf05b9e3175941b4454a0": {
+        "compensate": 1.021,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "0f2a6bc5b49d87d64728ee40e23bceb1": {
+        "compensate": 1.019,
+        "mdx_dim_f_set": 2560,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 5120,
+        "primary_stem": "Instrumental"
+    },
+    "b02be2d198d4968a121030cf8950b492": {
+        "compensate": 1.020,
+        "mdx_dim_f_set": 2560,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 5120,
+        "primary_stem": "No Crowd"
+    },
+    "2154254ee89b2945b97a7efed6e88820": {
+        "config_yaml": "model_2_stem_061321.yaml"
+    },
+    "063aadd735d58150722926dcbf5852a9": {
+        "config_yaml": "model_2_stem_061321.yaml"
+    },
+    "fe96801369f6a148df2720f5ced88c19": {
+        "config_yaml": "model3.yaml"
+    },
+    "02e8b226f85fb566e5db894b9931c640": {
+        "config_yaml": "model2.yaml"
+    },
+    "e3de6d861635ab9c1d766149edd680d6": {
+        "config_yaml": "model1.yaml"
+    },
+    "3f2936c554ab73ce2e396d54636bd373": {
+        "config_yaml": "modelB.yaml"
+    },
+    "890d0f6f82d7574bca741a9e8bcb8168": {
+        "config_yaml": "modelB.yaml"
+    },
+    "63a3cb8c37c474681049be4ad1ba8815": {
+        "config_yaml": "modelB.yaml"
+    },
+    "a7fc5d719743c7fd6b61bd2b4d48b9f0": {
+        "config_yaml": "modelA.yaml"
+    },
+    "3567f3dee6e77bf366fcb1c7b8bc3745": {
+        "config_yaml": "modelA.yaml"
+    },
+    "a28f4d717bd0d34cd2ff7a3b0a3d065e": {
+        "config_yaml": "modelA.yaml"
+    },
+    "c9971a18da20911822593dc81caa8be9": {
+        "config_yaml": "sndfx.yaml"
+    },
+    "57d94d5ed705460d21c75a5ac829a605": {
+        "config_yaml": "sndfx.yaml"
+    },
+    "e7a25f8764f25a52c1b96c4946e66ba2": {
+        "config_yaml": "sndfx.yaml"
+    },
+    "104081d24e37217086ce5fde09147ee1": {
+        "config_yaml": "model_2_stem_061321.yaml"
+    },
+    "1e6165b601539f38d0a9330f3facffeb": {
+        "config_yaml": "model_2_stem_061321.yaml"
+    },
+    "fe0108464ce0d8271be5ab810891bd7c": {
+        "config_yaml": "model_2_stem_full_band.yaml"
+    }
 }

requirements.txt CHANGED Viewed

@@ -1,19 +1,37 @@
-praat-parselmouth>=0.4.3
-pyworld==0.3.2
-faiss-cpu==1.7.3
-torchcrepe==0.0.20
-ffmpeg-python>=0.2.0
-fairseq==0.12.2
-gdown
-rarfile
-transformers
-accelerate
-optimum
-sentencepiece
-srt
-git+https://github.com/R3gm/openvoice_package.git@lite
-openai==1.14.3
-tiktoken==0.6.0
-# Documents
-pypdf==4.2.0
-python-docx

+# Temporal requirements
+nest_asyncio
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch>=2.1.0+cu118
+torchvision>=0.16.0+cu118
+torchaudio>=2.1.0+cu118
+yt-dlp
+gradio==4.19.2
+pydub==0.25.1
+edge_tts==6.1.7
+deep_translator==1.11.4
+git+https://github.com/m-bain/whisperX.git@a5dca2c
+gTTS
+gradio_client==0.10.1
+praat-parselmouth>=0.4.3
+pyworld==0.3.2
+faiss-cpu==1.7.3
+torchcrepe==0.0.20
+ffmpeg-python>=0.2.0
+git+https://github.com/facebookresearch/fairseq.git@refs/pull/5359/merge
+gdown
+rarfile
+IPython
+transformers
+accelerate
+optimum
+sentencepiece
+srt
+onnxruntime-gpu
+git+https://github.com/R3gm/openvoice_package.git@lite
+# Documents
+PyPDF2
+python-docx
+# after this
+# pip install git+https://github.com/omry/omegaconf.git@refs/pull/1137/merge

requirements_base.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+--extra-index-url https://download.pytorch.org/whl/cu118
+torch>=2.1.0+cu118
+torchvision>=0.16.0+cu118
+torchaudio>=2.1.0+cu118
+yt-dlp
+gradio==4.19.2
+pydub==0.25.1
+edge_tts==6.1.7
+deep_translator==1.11.4
+git+https://github.com/R3gm/pyannote-audio.git@3.1.1
+git+https://github.com/R3gm/whisperX.git@cuda_11_8
+nest_asyncio
+gTTS
+gradio_client==0.10.1
+IPython

requirements_extra.txt ADDED Viewed

	@@ -0,0 +1,19 @@

+praat-parselmouth>=0.4.3
+pyworld==0.3.2
+faiss-cpu==1.7.3
+torchcrepe==0.0.20
+ffmpeg-python>=0.2.0
+fairseq==0.12.2
+gdown
+rarfile
+transformers
+accelerate
+optimum
+sentencepiece
+srt
+git+https://github.com/R3gm/openvoice_package.git@lite
+openai==1.14.3
+tiktoken==0.6.0
+# Documents
+pypdf==4.2.0
+python-docx

requirements_xtts.txt CHANGED Viewed

@@ -1,58 +1,58 @@
-# core deps
-numpy==1.23.5
-cython>=0.29.30
-scipy>=1.11.2
-torch
-torchaudio
-soundfile
-librosa
-scikit-learn
-numba
-inflect>=5.6.0
-tqdm>=4.64.1
-anyascii>=0.3.0
-pyyaml>=6.0
-fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
-aiohttp>=3.8.1
-packaging>=23.1
-# deps for examples
-flask>=2.0.1
-# deps for inference
-pysbd>=0.3.4
-# deps for notebooks
-umap-learn>=0.5.1
-pandas
-# deps for training
-matplotlib
-# coqui stack
-trainer>=0.0.32
-# config management
-coqpit>=0.0.16
-# chinese g2p deps
-jieba
-pypinyin
-# korean
-hangul_romanize
-# gruut+supported langs
-gruut[de,es,fr]==2.2.3
-# deps for korean
-jamo
-nltk
-g2pkk>=0.1.1
-# deps for bangla
-bangla
-bnnumerizer
-bnunicodenormalizer
-#deps for tortoise
-einops>=0.6.0
-transformers
-#deps for bark
-encodec>=0.1.1
-# deps for XTTS
-unidecode>=1.3.2
-num2words
-spacy[ja]>=3
-# after this
-# pip install -r requirements_xtts.txt
 # pip install TTS==0.21.1  --no-deps

+# core deps
+numpy==1.23.5
+cython>=0.29.30
+scipy>=1.11.2
+torch
+torchaudio
+soundfile
+librosa
+scikit-learn
+numba
+inflect>=5.6.0
+tqdm>=4.64.1
+anyascii>=0.3.0
+pyyaml>=6.0
+fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
+aiohttp>=3.8.1
+packaging>=23.1
+# deps for examples
+flask>=2.0.1
+# deps for inference
+pysbd>=0.3.4
+# deps for notebooks
+umap-learn>=0.5.1
+pandas
+# deps for training
+matplotlib
+# coqui stack
+trainer>=0.0.32
+# config management
+coqpit>=0.0.16
+# chinese g2p deps
+jieba
+pypinyin
+# korean
+hangul_romanize
+# gruut+supported langs
+gruut[de,es,fr]==2.2.3
+# deps for korean
+jamo
+nltk
+g2pkk>=0.1.1
+# deps for bangla
+bangla
+bnnumerizer
+bnunicodenormalizer
+#deps for tortoise
+einops>=0.6.0
+transformers
+#deps for bark
+encodec>=0.1.1
+# deps for XTTS
+unidecode>=1.3.2
+num2words
+spacy[ja]>=3
+# after this
+# pip install -r requirements_xtts.txt
 # pip install TTS==0.21.1  --no-deps

soni_translate/audio_segments.py CHANGED Viewed

@@ -1,141 +1,141 @@
-from pydub import AudioSegment
-from tqdm import tqdm
-from .utils import run_command
-from .logging_setup import logger
-import numpy as np
-class Mixer:
-    def __init__(self):
-        self.parts = []
-    def __len__(self):
-        parts = self._sync()
-        seg = parts[0][1]
-        frame_count = max(offset + seg.frame_count() for offset, seg in parts)
-        return int(1000.0 * frame_count / seg.frame_rate)
-    def overlay(self, sound, position=0):
-        self.parts.append((position, sound))
-        return self
-    def _sync(self):
-        positions, segs = zip(*self.parts)
-        frame_rate = segs[0].frame_rate
-        array_type = segs[0].array_type # noqa
-        offsets = [int(frame_rate * pos / 1000.0) for pos in positions]
-        segs = AudioSegment.empty()._sync(*segs)
-        return list(zip(offsets, segs))
-    def append(self, sound):
-        self.overlay(sound, position=len(self))
-    def to_audio_segment(self):
-        parts = self._sync()
-        seg = parts[0][1]
-        channels = seg.channels
-        frame_count = max(offset + seg.frame_count() for offset, seg in parts)
-        sample_count = int(frame_count * seg.channels)
-        output = np.zeros(sample_count, dtype="int32")
-        for offset, seg in parts:
-            sample_offset = offset * channels
-            samples = np.frombuffer(seg.get_array_of_samples(), dtype="int32")
-            samples = np.int16(samples/np.max(np.abs(samples)) * 32767)
-            start = sample_offset
-            end = start + len(samples)
-            output[start:end] += samples
-        return seg._spawn(
-            output, overrides={"sample_width": 4}).normalize(headroom=0.0)
-def create_translated_audio(
-    result_diarize, audio_files, final_file, concat=False, avoid_overlap=False,
-):
-    total_duration = result_diarize["segments"][-1]["end"]  # in seconds
-    if concat:
-        """
-        file .\audio\1.ogg
-        file .\audio\2.ogg
-        file .\audio\3.ogg
-        file .\audio\4.ogg
-        ...
-        """
-        # Write the file paths to list.txt
-        with open("list.txt", "w") as file:
-            for i, audio_file in enumerate(audio_files):
-                if i == len(audio_files) - 1:  # Check if it's the last item
-                    file.write(f"file {audio_file}")
-                else:
-                    file.write(f"file {audio_file}\n")
-        # command = f"ffmpeg -f concat -safe 0 -i list.txt {final_file}"
-        command = (
-            f"ffmpeg -f concat -safe 0 -i list.txt -c:a pcm_s16le {final_file}"
-        )
-        run_command(command)
-    else:
-        # silent audio with total_duration
-        base_audio = AudioSegment.silent(
-            duration=int(total_duration * 1000), frame_rate=41000
-        )
-        combined_audio = Mixer()
-        combined_audio.overlay(base_audio)
-        logger.debug(
-            f"Audio duration: {total_duration // 60} "
-            f"minutes and {int(total_duration % 60)} seconds"
-        )
-        last_end_time = 0
-        previous_speaker = ""
-        for line, audio_file in tqdm(
-            zip(result_diarize["segments"], audio_files)
-        ):
-            start = float(line["start"])
-            # Overlay each audio at the corresponding time
-            try:
-                audio = AudioSegment.from_file(audio_file)
-                # audio_a = audio.speedup(playback_speed=1.5)
-                if avoid_overlap:
-                    speaker = line["speaker"]
-                    if (last_end_time - 0.500) > start:
-                        overlap_time = last_end_time - start
-                        if previous_speaker and previous_speaker != speaker:
-                            start = (last_end_time - 0.500)
-                        else:
-                            start = (last_end_time - 0.200)
-                        if overlap_time > 2.5:
-                            start = start - 0.3
-                        logger.info(
-                              f"Avoid overlap for {str(audio_file)} "
-                              f"with {str(start)}"
-                        )
-                    previous_speaker = speaker
-                    duration_tts_seconds = len(audio) / 1000.0  # to sec
-                    last_end_time = (start + duration_tts_seconds)
-                start_time = start * 1000  # to ms
-                combined_audio = combined_audio.overlay(
-                    audio, position=start_time
-                )
-            except Exception as error:
-                logger.debug(str(error))
-                logger.error(f"Error audio file {audio_file}")
-        # combined audio as a file
-        combined_audio_data = combined_audio.to_audio_segment()
-        combined_audio_data.export(
-            final_file, format="wav"
-        )  # best than ogg, change if the audio is anomalous

+from pydub import AudioSegment
+from tqdm import tqdm
+from .utils import run_command
+from .logging_setup import logger
+import numpy as np
+class Mixer:
+    def __init__(self):
+        self.parts = []
+    def __len__(self):
+        parts = self._sync()
+        seg = parts[0][1]
+        frame_count = max(offset + seg.frame_count() for offset, seg in parts)
+        return int(1000.0 * frame_count / seg.frame_rate)
+    def overlay(self, sound, position=0):
+        self.parts.append((position, sound))
+        return self
+    def _sync(self):
+        positions, segs = zip(*self.parts)
+        frame_rate = segs[0].frame_rate
+        array_type = segs[0].array_type # noqa
+        offsets = [int(frame_rate * pos / 1000.0) for pos in positions]
+        segs = AudioSegment.empty()._sync(*segs)
+        return list(zip(offsets, segs))
+    def append(self, sound):
+        self.overlay(sound, position=len(self))
+    def to_audio_segment(self):
+        parts = self._sync()
+        seg = parts[0][1]
+        channels = seg.channels
+        frame_count = max(offset + seg.frame_count() for offset, seg in parts)
+        sample_count = int(frame_count * seg.channels)
+        output = np.zeros(sample_count, dtype="int32")
+        for offset, seg in parts:
+            sample_offset = offset * channels
+            samples = np.frombuffer(seg.get_array_of_samples(), dtype="int32")
+            samples = np.int16(samples/np.max(np.abs(samples)) * 32767)
+            start = sample_offset
+            end = start + len(samples)
+            output[start:end] += samples
+        return seg._spawn(
+            output, overrides={"sample_width": 4}).normalize(headroom=0.0)
+def create_translated_audio(
+    result_diarize, audio_files, final_file, concat=False, avoid_overlap=False,
+):
+    total_duration = result_diarize["segments"][-1]["end"]  # in seconds
+    if concat:
+        """
+        file .\audio\1.ogg
+        file .\audio\2.ogg
+        file .\audio\3.ogg
+        file .\audio\4.ogg
+        ...
+        """
+        # Write the file paths to list.txt
+        with open("list.txt", "w") as file:
+            for i, audio_file in enumerate(audio_files):
+                if i == len(audio_files) - 1:  # Check if it's the last item
+                    file.write(f"file {audio_file}")
+                else:
+                    file.write(f"file {audio_file}\n")
+        # command = f"ffmpeg -f concat -safe 0 -i list.txt {final_file}"
+        command = (
+            f"ffmpeg -f concat -safe 0 -i list.txt -c:a pcm_s16le {final_file}"
+        )
+        run_command(command)
+    else:
+        # silent audio with total_duration
+        base_audio = AudioSegment.silent(
+            duration=int(total_duration * 1000), frame_rate=41000
+        )
+        combined_audio = Mixer()
+        combined_audio.overlay(base_audio)
+        logger.debug(
+            f"Audio duration: {total_duration // 60} "
+            f"minutes and {int(total_duration % 60)} seconds"
+        )
+        last_end_time = 0
+        previous_speaker = ""
+        for line, audio_file in tqdm(
+            zip(result_diarize["segments"], audio_files)
+        ):
+            start = float(line["start"])
+            # Overlay each audio at the corresponding time
+            try:
+                audio = AudioSegment.from_file(audio_file)
+                # audio_a = audio.speedup(playback_speed=1.5)
+                if avoid_overlap:
+                    speaker = line["speaker"]
+                    if (last_end_time - 0.500) > start:
+                        overlap_time = last_end_time - start
+                        if previous_speaker and previous_speaker != speaker:
+                            start = (last_end_time - 0.500)
+                        else:
+                            start = (last_end_time - 0.200)
+                        if overlap_time > 2.5:
+                            start = start - 0.3
+                        logger.info(
+                              f"Avoid overlap for {str(audio_file)} "
+                              f"with {str(start)}"
+                        )
+                    previous_speaker = speaker
+                    duration_tts_seconds = len(audio) / 1000.0  # to sec
+                    last_end_time = (start + duration_tts_seconds)
+                start_time = start * 1000  # to ms
+                combined_audio = combined_audio.overlay(
+                    audio, position=start_time
+                )
+            except Exception as error:
+                logger.debug(str(error))
+                logger.error(f"Error audio file {audio_file}")
+        # combined audio as a file
+        combined_audio_data = combined_audio.to_audio_segment()
+        combined_audio_data.export(
+            final_file, format="wav"
+        )  # best than ogg, change if the audio is anomalous

soni_translate/language_configuration.py CHANGED Viewed

@@ -1,551 +1,551 @@
-from .logging_setup import logger
-LANGUAGES_UNIDIRECTIONAL = {
-    "Aymara (ay)": "ay",
-    "Bambara (bm)": "bm",
-    "Cebuano (ceb)": "ceb",
-    "Chichewa (ny)": "ny",
-    "Divehi (dv)": "dv",
-    "Dogri (doi)": "doi",
-    "Ewe (ee)": "ee",
-    "Guarani (gn)": "gn",
-    "Iloko (ilo)": "ilo",
-    "Kinyarwanda (rw)": "rw",
-    "Krio (kri)": "kri",
-    "Kurdish (ku)": "ku",
-    "Kirghiz (ky)": "ky",
-    "Ganda (lg)": "lg",
-    "Maithili (mai)": "mai",
-    "Oriya (or)": "or",
-    "Oromo (om)": "om",
-    "Quechua (qu)": "qu",
-    "Samoan (sm)": "sm",
-    "Tigrinya (ti)": "ti",
-    "Tsonga (ts)": "ts",
-    "Akan (ak)": "ak",
-    "Uighur (ug)": "ug"
-}
-UNIDIRECTIONAL_L_LIST = LANGUAGES_UNIDIRECTIONAL.keys()
-LANGUAGES = {
-    "Automatic detection": "Automatic detection",
-    "Arabic (ar)": "ar",
-    "Chinese - Simplified (zh-CN)": "zh",
-    "Czech (cs)": "cs",
-    "Danish (da)": "da",
-    "Dutch (nl)": "nl",
-    "English (en)": "en",
-    "Finnish (fi)": "fi",
-    "French (fr)": "fr",
-    "German (de)": "de",
-    "Greek (el)": "el",
-    "Hebrew (he)": "he",
-    "Hungarian (hu)": "hu",
-    "Italian (it)": "it",
-    "Japanese (ja)": "ja",
-    "Korean (ko)": "ko",
-    "Persian (fa)": "fa",  # no aux gTTS
-    "Polish (pl)": "pl",
-    "Portuguese (pt)": "pt",
-    "Russian (ru)": "ru",
-    "Spanish (es)": "es",
-    "Turkish (tr)": "tr",
-    "Ukrainian (uk)": "uk",
-    "Urdu (ur)": "ur",
-    "Vietnamese (vi)": "vi",
-    "Hindi (hi)": "hi",
-    "Indonesian (id)": "id",
-    "Bengali (bn)": "bn",
-    "Telugu (te)": "te",
-    "Marathi (mr)": "mr",
-    "Tamil (ta)": "ta",
-    "Javanese (jw|jv)": "jw",
-    "Catalan (ca)": "ca",
-    "Nepali (ne)": "ne",
-    "Thai (th)": "th",
-    "Swedish (sv)": "sv",
-    "Amharic (am)": "am",
-    "Welsh (cy)": "cy",  # no aux gTTS
-    "Estonian (et)": "et",
-    "Croatian (hr)": "hr",
-    "Icelandic (is)": "is",
-    "Georgian (ka)": "ka",  # no aux gTTS
-    "Khmer (km)": "km",
-    "Slovak (sk)": "sk",
-    "Albanian (sq)": "sq",
-    "Serbian (sr)": "sr",
-    "Azerbaijani (az)": "az",  # no aux gTTS
-    "Bulgarian (bg)": "bg",
-    "Galician (gl)": "gl",  # no aux gTTS
-    "Gujarati (gu)": "gu",
-    "Kazakh (kk)": "kk",  # no aux gTTS
-    "Kannada (kn)": "kn",
-    "Lithuanian (lt)": "lt",  # no aux gTTS
-    "Latvian (lv)": "lv",
-    "Macedonian (mk)": "mk",  # no aux gTTS # error get align model
-    "Malayalam (ml)": "ml",
-    "Malay (ms)": "ms",  # error get align model
-    "Romanian (ro)": "ro",
-    "Sinhala (si)": "si",
-    "Sundanese (su)": "su",
-    "Swahili (sw)": "sw",  # error aling
-    "Afrikaans (af)": "af",
-    "Bosnian (bs)": "bs",
-    "Latin (la)": "la",
-    "Myanmar Burmese (my)": "my",
-    "Norwegian (no|nb)": "no",
-    "Chinese - Traditional (zh-TW)": "zh-TW",
-    "Assamese (as)": "as",
-    "Basque (eu)": "eu",
-    "Hausa (ha)": "ha",
-    "Haitian Creole (ht)": "ht",
-    "Armenian (hy)": "hy",
-    "Lao (lo)": "lo",
-    "Malagasy (mg)": "mg",
-    "Mongolian (mn)": "mn",
-    "Maltese (mt)": "mt",
-    "Punjabi (pa)": "pa",
-    "Pashto (ps)": "ps",
-    "Slovenian (sl)": "sl",
-    "Shona (sn)": "sn",
-    "Somali (so)": "so",
-    "Tajik (tg)": "tg",
-    "Turkmen (tk)": "tk",
-    "Tatar (tt)": "tt",
-    "Uzbek (uz)": "uz",
-    "Yoruba (yo)": "yo",
-    **LANGUAGES_UNIDIRECTIONAL
-}
-BASE_L_LIST = LANGUAGES.keys()
-LANGUAGES_LIST = [list(BASE_L_LIST)[0]] + sorted(list(BASE_L_LIST)[1:])
-INVERTED_LANGUAGES = {value: key for key, value in LANGUAGES.items()}
-EXTRA_ALIGN = {
-    "id": "indonesian-nlp/wav2vec2-large-xlsr-indonesian",
-    "bn": "arijitx/wav2vec2-large-xlsr-bengali",
-    "mr": "sumedh/wav2vec2-large-xlsr-marathi",
-    "ta": "Amrrs/wav2vec2-large-xlsr-53-tamil",
-    "jw": "cahya/wav2vec2-large-xlsr-javanese",
-    "ne": "shniranjan/wav2vec2-large-xlsr-300m-nepali",
-    "th": "sakares/wav2vec2-large-xlsr-thai-demo",
-    "sv": "KBLab/wav2vec2-large-voxrex-swedish",
-    "am": "agkphysics/wav2vec2-large-xlsr-53-amharic",
-    "cy": "Srulikbdd/Wav2Vec2-large-xlsr-welsh",
-    "et": "anton-l/wav2vec2-large-xlsr-53-estonian",
-    "hr": "classla/wav2vec2-xls-r-parlaspeech-hr",
-    "is": "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h",
-    "ka": "MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Georgian",
-    "km": "vitouphy/wav2vec2-xls-r-300m-khmer",
-    "sk": "infinitejoy/wav2vec2-large-xls-r-300m-slovak",
-    "sq": "Alimzhan/wav2vec2-large-xls-r-300m-albanian-colab",
-    "sr": "dnikolic/wav2vec2-xlsr-530-serbian-colab",
-    "az": "nijatzeynalov/wav2vec2-large-mms-1b-azerbaijani-common_voice15.0",
-    "bg": "infinitejoy/wav2vec2-large-xls-r-300m-bulgarian",
-    "gl": "ifrz/wav2vec2-large-xlsr-galician",
-    "gu": "Harveenchadha/vakyansh-wav2vec2-gujarati-gnm-100",
-    "kk": "aismlv/wav2vec2-large-xlsr-kazakh",
-    "kn": "Harveenchadha/vakyansh-wav2vec2-kannada-knm-560",
-    "lt": "DeividasM/wav2vec2-large-xlsr-53-lithuanian",
-    "lv": "anton-l/wav2vec2-large-xlsr-53-latvian",
-    "mk": "",  # Konstantin-Bogdanoski/wav2vec2-macedonian-base
-    "ml": "gvs/wav2vec2-large-xlsr-malayalam",
-    "ms": "",  # Duy/wav2vec2_malay
-    "ro": "anton-l/wav2vec2-large-xlsr-53-romanian",
-    "si": "IAmNotAnanth/wav2vec2-large-xls-r-300m-sinhala",
-    "su": "cahya/wav2vec2-large-xlsr-sundanese",
-    "sw": "",  # Lians/fine-tune-wav2vec2-large-swahili
-    "af": "",  # ylacombe/wav2vec2-common_voice-af-demo
-    "bs": "",
-    "la": "",
-    "my": "",
-    "no": "NbAiLab/wav2vec2-xlsr-300m-norwegian",
-    "zh-TW": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn",
-    "as": "",
-    "eu": "", # cahya/wav2vec2-large-xlsr-basque # verify
-    "ha": "infinitejoy/wav2vec2-large-xls-r-300m-hausa",
-    "ht": "",
-    "hy": "infinitejoy/wav2vec2-large-xls-r-300m-armenian", # no (.)
-    "lo": "",
-    "mg": "",
-    "mn": "tugstugi/wav2vec2-large-xlsr-53-mongolian",
-    "mt": "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-maltese-64h",
-    "pa": "kingabzpro/wav2vec2-large-xlsr-53-punjabi",
-    "ps": "aamirhs/wav2vec2-large-xls-r-300m-pashto-colab",
-    "sl": "anton-l/wav2vec2-large-xlsr-53-slovenian",
-    "sn": "",
-    "so": "",
-    "tg": "",
-    "tk": "",  # Ragav/wav2vec2-tk
-    "tt": "anton-l/wav2vec2-large-xlsr-53-tatar",
-    "uz": "",  # Mekhriddin/wav2vec2-large-xls-r-300m-uzbek-colab
-    "yo": "ogbi/wav2vec2-large-mms-1b-yoruba-test",
-}
-def fix_code_language(translate_to, syntax="google"):
-    if syntax == "google":
-        # google-translator, gTTS
-        replace_lang_code = {"zh": "zh-CN", "he": "iw", "zh-cn": "zh-CN"}
-    elif syntax == "coqui":
-        # coqui-xtts
-        replace_lang_code = {"zh": "zh-cn", "zh-CN": "zh-cn", "zh-TW": "zh-cn"}
-    new_code_lang = replace_lang_code.get(translate_to, translate_to)
-    logger.debug(f"Fix code {translate_to} -> {new_code_lang}")
-    return new_code_lang
-BARK_VOICES_LIST = {
-    "de_speaker_0-Male BARK": "v2/de_speaker_0",
-    "de_speaker_1-Male BARK": "v2/de_speaker_1",
-    "de_speaker_2-Male BARK": "v2/de_speaker_2",
-    "de_speaker_3-Female BARK": "v2/de_speaker_3",
-    "de_speaker_4-Male BARK": "v2/de_speaker_4",
-    "de_speaker_5-Male BARK": "v2/de_speaker_5",
-    "de_speaker_6-Male BARK": "v2/de_speaker_6",
-    "de_speaker_7-Male BARK": "v2/de_speaker_7",
-    "de_speaker_8-Female BARK": "v2/de_speaker_8",
-    "de_speaker_9-Male BARK": "v2/de_speaker_9",
-    "en_speaker_0-Male BARK": "v2/en_speaker_0",
-    "en_speaker_1-Male BARK": "v2/en_speaker_1",
-    "en_speaker_2-Male BARK": "v2/en_speaker_2",
-    "en_speaker_3-Male BARK": "v2/en_speaker_3",
-    "en_speaker_4-Male BARK": "v2/en_speaker_4",
-    "en_speaker_5-Male BARK": "v2/en_speaker_5",
-    "en_speaker_6-Male BARK": "v2/en_speaker_6",
-    "en_speaker_7-Male BARK": "v2/en_speaker_7",
-    "en_speaker_8-Male BARK": "v2/en_speaker_8",
-    "en_speaker_9-Female BARK": "v2/en_speaker_9",
-    "es_speaker_0-Male BARK": "v2/es_speaker_0",
-    "es_speaker_1-Male BARK": "v2/es_speaker_1",
-    "es_speaker_2-Male BARK": "v2/es_speaker_2",
-    "es_speaker_3-Male BARK": "v2/es_speaker_3",
-    "es_speaker_4-Male BARK": "v2/es_speaker_4",
-    "es_speaker_5-Male BARK": "v2/es_speaker_5",
-    "es_speaker_6-Male BARK": "v2/es_speaker_6",
-    "es_speaker_7-Male BARK": "v2/es_speaker_7",
-    "es_speaker_8-Female BARK": "v2/es_speaker_8",
-    "es_speaker_9-Female BARK": "v2/es_speaker_9",
-    "fr_speaker_0-Male BARK": "v2/fr_speaker_0",
-    "fr_speaker_1-Female BARK": "v2/fr_speaker_1",
-    "fr_speaker_2-Female BARK": "v2/fr_speaker_2",
-    "fr_speaker_3-Male BARK": "v2/fr_speaker_3",
-    "fr_speaker_4-Male BARK": "v2/fr_speaker_4",
-    "fr_speaker_5-Female BARK": "v2/fr_speaker_5",
-    "fr_speaker_6-Male BARK": "v2/fr_speaker_6",
-    "fr_speaker_7-Male BARK": "v2/fr_speaker_7",
-    "fr_speaker_8-Male BARK": "v2/fr_speaker_8",
-    "fr_speaker_9-Male BARK": "v2/fr_speaker_9",
-    "hi_speaker_0-Female BARK": "v2/hi_speaker_0",
-    "hi_speaker_1-Female BARK": "v2/hi_speaker_1",
-    "hi_speaker_2-Male BARK": "v2/hi_speaker_2",
-    "hi_speaker_3-Female BARK": "v2/hi_speaker_3",
-    "hi_speaker_4-Female BARK": "v2/hi_speaker_4",
-    "hi_speaker_5-Male BARK": "v2/hi_speaker_5",
-    "hi_speaker_6-Male BARK": "v2/hi_speaker_6",
-    "hi_speaker_7-Male BARK": "v2/hi_speaker_7",
-    "hi_speaker_8-Male BARK": "v2/hi_speaker_8",
-    "hi_speaker_9-Female BARK": "v2/hi_speaker_9",
-    "it_speaker_0-Male BARK": "v2/it_speaker_0",
-    "it_speaker_1-Male BARK": "v2/it_speaker_1",
-    "it_speaker_2-Female BARK": "v2/it_speaker_2",
-    "it_speaker_3-Male BARK": "v2/it_speaker_3",
-    "it_speaker_4-Male BARK": "v2/it_speaker_4",
-    "it_speaker_5-Male BARK": "v2/it_speaker_5",
-    "it_speaker_6-Male BARK": "v2/it_speaker_6",
-    "it_speaker_7-Female BARK": "v2/it_speaker_7",
-    "it_speaker_8-Male BARK": "v2/it_speaker_8",
-    "it_speaker_9-Female BARK": "v2/it_speaker_9",
-    "ja_speaker_0-Female BARK": "v2/ja_speaker_0",
-    "ja_speaker_1-Female BARK": "v2/ja_speaker_1",
-    "ja_speaker_2-Male BARK": "v2/ja_speaker_2",
-    "ja_speaker_3-Female BARK": "v2/ja_speaker_3",
-    "ja_speaker_4-Female BARK": "v2/ja_speaker_4",
-    "ja_speaker_5-Female BARK": "v2/ja_speaker_5",
-    "ja_speaker_6-Male BARK": "v2/ja_speaker_6",
-    "ja_speaker_7-Female BARK": "v2/ja_speaker_7",
-    "ja_speaker_8-Female BARK": "v2/ja_speaker_8",
-    "ja_speaker_9-Female BARK": "v2/ja_speaker_9",
-    "ko_speaker_0-Female BARK": "v2/ko_speaker_0",
-    "ko_speaker_1-Male BARK": "v2/ko_speaker_1",
-    "ko_speaker_2-Male BARK": "v2/ko_speaker_2",
-    "ko_speaker_3-Male BARK": "v2/ko_speaker_3",
-    "ko_speaker_4-Male BARK": "v2/ko_speaker_4",
-    "ko_speaker_5-Male BARK": "v2/ko_speaker_5",
-    "ko_speaker_6-Male BARK": "v2/ko_speaker_6",
-    "ko_speaker_7-Male BARK": "v2/ko_speaker_7",
-    "ko_speaker_8-Male BARK": "v2/ko_speaker_8",
-    "ko_speaker_9-Male BARK": "v2/ko_speaker_9",
-    "pl_speaker_0-Male BARK": "v2/pl_speaker_0",
-    "pl_speaker_1-Male BARK": "v2/pl_speaker_1",
-    "pl_speaker_2-Male BARK": "v2/pl_speaker_2",
-    "pl_speaker_3-Male BARK": "v2/pl_speaker_3",
-    "pl_speaker_4-Female BARK": "v2/pl_speaker_4",
-    "pl_speaker_5-Male BARK": "v2/pl_speaker_5",
-    "pl_speaker_6-Female BARK": "v2/pl_speaker_6",
-    "pl_speaker_7-Male BARK": "v2/pl_speaker_7",
-    "pl_speaker_8-Male BARK": "v2/pl_speaker_8",
-    "pl_speaker_9-Female BARK": "v2/pl_speaker_9",
-    "pt_speaker_0-Male BARK": "v2/pt_speaker_0",
-    "pt_speaker_1-Male BARK": "v2/pt_speaker_1",
-    "pt_speaker_2-Male BARK": "v2/pt_speaker_2",
-    "pt_speaker_3-Male BARK": "v2/pt_speaker_3",
-    "pt_speaker_4-Male BARK": "v2/pt_speaker_4",
-    "pt_speaker_5-Male BARK": "v2/pt_speaker_5",
-    "pt_speaker_6-Male BARK": "v2/pt_speaker_6",
-    "pt_speaker_7-Male BARK": "v2/pt_speaker_7",
-    "pt_speaker_8-Male BARK": "v2/pt_speaker_8",
-    "pt_speaker_9-Male BARK": "v2/pt_speaker_9",
-    "ru_speaker_0-Male BARK": "v2/ru_speaker_0",
-    "ru_speaker_1-Male BARK": "v2/ru_speaker_1",
-    "ru_speaker_2-Male BARK": "v2/ru_speaker_2",
-    "ru_speaker_3-Male BARK": "v2/ru_speaker_3",
-    "ru_speaker_4-Male BARK": "v2/ru_speaker_4",
-    "ru_speaker_5-Female BARK": "v2/ru_speaker_5",
-    "ru_speaker_6-Female BARK": "v2/ru_speaker_6",
-    "ru_speaker_7-Male BARK": "v2/ru_speaker_7",
-    "ru_speaker_8-Male BARK": "v2/ru_speaker_8",
-    "ru_speaker_9-Female BARK": "v2/ru_speaker_9",
-    "tr_speaker_0-Male BARK": "v2/tr_speaker_0",
-    "tr_speaker_1-Male BARK": "v2/tr_speaker_1",
-    "tr_speaker_2-Male BARK": "v2/tr_speaker_2",
-    "tr_speaker_3-Male BARK": "v2/tr_speaker_3",
-    "tr_speaker_4-Female BARK": "v2/tr_speaker_4",
-    "tr_speaker_5-Female BARK": "v2/tr_speaker_5",
-    "tr_speaker_6-Male BARK": "v2/tr_speaker_6",
-    "tr_speaker_7-Male BARK": "v2/tr_speaker_7",
-    "tr_speaker_8-Male BARK": "v2/tr_speaker_8",
-    "tr_speaker_9-Male BARK": "v2/tr_speaker_9",
-    "zh_speaker_0-Male BARK": "v2/zh_speaker_0",
-    "zh_speaker_1-Male BARK": "v2/zh_speaker_1",
-    "zh_speaker_2-Male BARK": "v2/zh_speaker_2",
-    "zh_speaker_3-Male BARK": "v2/zh_speaker_3",
-    "zh_speaker_4-Female BARK": "v2/zh_speaker_4",
-    "zh_speaker_5-Male BARK": "v2/zh_speaker_5",
-    "zh_speaker_6-Female BARK": "v2/zh_speaker_6",
-    "zh_speaker_7-Female BARK": "v2/zh_speaker_7",
-    "zh_speaker_8-Male BARK": "v2/zh_speaker_8",
-    "zh_speaker_9-Female BARK": "v2/zh_speaker_9",
-}
-VITS_VOICES_LIST = {
-    "ar-facebook-mms VITS": "facebook/mms-tts-ara",
-    # 'zh-facebook-mms VITS': 'facebook/mms-tts-cmn',
-    "zh_Hakka-facebook-mms VITS": "facebook/mms-tts-hak",
-    "zh_MinNan-facebook-mms VITS": "facebook/mms-tts-nan",
-    # 'cs-facebook-mms VITS': 'facebook/mms-tts-ces',
-    # 'da-facebook-mms VITS': 'facebook/mms-tts-dan',
-    "nl-facebook-mms VITS": "facebook/mms-tts-nld",
-    "en-facebook-mms VITS": "facebook/mms-tts-eng",
-    "fi-facebook-mms VITS": "facebook/mms-tts-fin",
-    "fr-facebook-mms VITS": "facebook/mms-tts-fra",
-    "de-facebook-mms VITS": "facebook/mms-tts-deu",
-    "el-facebook-mms VITS": "facebook/mms-tts-ell",
-    "el_Ancient-facebook-mms VITS": "facebook/mms-tts-grc",
-    "he-facebook-mms VITS": "facebook/mms-tts-heb",
-    "hu-facebook-mms VITS": "facebook/mms-tts-hun",
-    # 'it-facebook-mms VITS': 'facebook/mms-tts-ita',
-    # 'ja-facebook-mms VITS': 'facebook/mms-tts-jpn',
-    "ko-facebook-mms VITS": "facebook/mms-tts-kor",
-    "fa-facebook-mms VITS": "facebook/mms-tts-fas",
-    "pl-facebook-mms VITS": "facebook/mms-tts-pol",
-    "pt-facebook-mms VITS": "facebook/mms-tts-por",
-    "ru-facebook-mms VITS": "facebook/mms-tts-rus",
-    "es-facebook-mms VITS": "facebook/mms-tts-spa",
-    "tr-facebook-mms VITS": "facebook/mms-tts-tur",
-    "uk-facebook-mms VITS": "facebook/mms-tts-ukr",
-    "ur_arabic-facebook-mms VITS": "facebook/mms-tts-urd-script_arabic",
-    "ur_devanagari-facebook-mms VITS": "facebook/mms-tts-urd-script_devanagari",
-    "ur_latin-facebook-mms VITS": "facebook/mms-tts-urd-script_latin",
-    "vi-facebook-mms VITS": "facebook/mms-tts-vie",
-    "hi-facebook-mms VITS": "facebook/mms-tts-hin",
-    "hi_Fiji-facebook-mms VITS": "facebook/mms-tts-hif",
-    "id-facebook-mms VITS": "facebook/mms-tts-ind",
-    "bn-facebook-mms VITS": "facebook/mms-tts-ben",
-    "te-facebook-mms VITS": "facebook/mms-tts-tel",
-    "mr-facebook-mms VITS": "facebook/mms-tts-mar",
-    "ta-facebook-mms VITS": "facebook/mms-tts-tam",
-    "jw-facebook-mms VITS": "facebook/mms-tts-jav",
-    "jw_Suriname-facebook-mms VITS": "facebook/mms-tts-jvn",
-    "ca-facebook-mms VITS": "facebook/mms-tts-cat",
-    "ne-facebook-mms VITS": "facebook/mms-tts-nep",
-    "th-facebook-mms VITS": "facebook/mms-tts-tha",
-    "th_Northern-facebook-mms VITS": "facebook/mms-tts-nod",
-    "sv-facebook-mms VITS": "facebook/mms-tts-swe",
-    "am-facebook-mms VITS": "facebook/mms-tts-amh",
-    "cy-facebook-mms VITS": "facebook/mms-tts-cym",
-    # "et-facebook-mms VITS": "facebook/mms-tts-est",
-    # "ht-facebook-mms VITS": "facebook/mms-tts-hrv",
-    "is-facebook-mms VITS": "facebook/mms-tts-isl",
-    "km-facebook-mms VITS": "facebook/mms-tts-khm",
-    "km_Northern-facebook-mms VITS": "facebook/mms-tts-kxm",
-    # "sk-facebook-mms VITS": "facebook/mms-tts-slk",
-    "sq_Northern-facebook-mms VITS": "facebook/mms-tts-sqi",
-    "az_South-facebook-mms VITS": "facebook/mms-tts-azb",
-    "az_North_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-azj-script_cyrillic",
-    "az_North_script_latin-facebook-mms VITS": "facebook/mms-tts-azj-script_latin",
-    "bg-facebook-mms VITS": "facebook/mms-tts-bul",
-    # "gl-facebook-mms VITS": "facebook/mms-tts-glg",
-    "gu-facebook-mms VITS": "facebook/mms-tts-guj",
-    "kk-facebook-mms VITS": "facebook/mms-tts-kaz",
-    "kn-facebook-mms VITS": "facebook/mms-tts-kan",
-    # "lt-facebook-mms VITS": "facebook/mms-tts-lit",
-    "lv-facebook-mms VITS": "facebook/mms-tts-lav",
-    # "mk-facebook-mms VITS": "facebook/mms-tts-mkd",
-    "ml-facebook-mms VITS": "facebook/mms-tts-mal",
-    "ms-facebook-mms VITS": "facebook/mms-tts-zlm",
-    "ms_Central-facebook-mms VITS": "facebook/mms-tts-pse",
-    "ms_Manado-facebook-mms VITS": "facebook/mms-tts-xmm",
-    "ro-facebook-mms VITS": "facebook/mms-tts-ron",
-    # "si-facebook-mms VITS": "facebook/mms-tts-sin",
-    "sw-facebook-mms VITS": "facebook/mms-tts-swh",
-    # "af-facebook-mms VITS": "facebook/mms-tts-afr",
-    # "bs-facebook-mms VITS": "facebook/mms-tts-bos",
-    "la-facebook-mms VITS": "facebook/mms-tts-lat",
-    "my-facebook-mms VITS": "facebook/mms-tts-mya",
-    # "no_Bokmål-facebook-mms VITS": "thomasht86/mms-tts-nob",  # verify
-    "as-facebook-mms VITS": "facebook/mms-tts-asm",
-    "as_Nagamese-facebook-mms VITS": "facebook/mms-tts-nag",
-    "eu-facebook-mms VITS": "facebook/mms-tts-eus",
-    "ha-facebook-mms VITS": "facebook/mms-tts-hau",
-    "ht-facebook-mms VITS": "facebook/mms-tts-hat",
-    "hy_Western-facebook-mms VITS": "facebook/mms-tts-hyw",
-    "lo-facebook-mms VITS": "facebook/mms-tts-lao",
-    "mg-facebook-mms VITS": "facebook/mms-tts-mlg",
-    "mn-facebook-mms VITS": "facebook/mms-tts-mon",
-    # "mt-facebook-mms VITS": "facebook/mms-tts-mlt",
-    "pa_Eastern-facebook-mms VITS": "facebook/mms-tts-pan",
-    # "pa_Western-facebook-mms VITS": "facebook/mms-tts-pnb",
-    # "ps-facebook-mms VITS": "facebook/mms-tts-pus",
-    # "sl-facebook-mms VITS": "facebook/mms-tts-slv",
-    "sn-facebook-mms VITS": "facebook/mms-tts-sna",
-    "so-facebook-mms VITS": "facebook/mms-tts-son",
-    "tg-facebook-mms VITS": "facebook/mms-tts-tgk",
-    "tk_script_arabic-facebook-mms VITS": "facebook/mms-tts-tuk-script_arabic",
-    "tk_script_latin-facebook-mms VITS": "facebook/mms-tts-tuk-script_latin",
-    "tt-facebook-mms VITS": "facebook/mms-tts-tat",
-    "tt_Crimean-facebook-mms VITS": "facebook/mms-tts-crh",
-    "uz_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uzb-script_cyrillic",
-    "yo-facebook-mms VITS": "facebook/mms-tts-yor",
-    "ay-facebook-mms VITS": "facebook/mms-tts-ayr",
-    "bm-facebook-mms VITS": "facebook/mms-tts-bam",
-    "ceb-facebook-mms VITS": "facebook/mms-tts-ceb",
-    "ny-facebook-mms VITS": "facebook/mms-tts-nya",
-    "dv-facebook-mms VITS": "facebook/mms-tts-div",
-    "doi-facebook-mms VITS": "facebook/mms-tts-dgo",
-    "ee-facebook-mms VITS": "facebook/mms-tts-ewe",
-    "gn-facebook-mms VITS": "facebook/mms-tts-grn",
-    "ilo-facebook-mms VITS": "facebook/mms-tts-ilo",
-    "rw-facebook-mms VITS": "facebook/mms-tts-kin",
-    "kri-facebook-mms VITS": "facebook/mms-tts-kri",
-    "ku_script_arabic-facebook-mms VITS": "facebook/mms-tts-kmr-script_arabic",
-    "ku_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-kmr-script_cyrillic",
-    "ku_script_latin-facebook-mms VITS": "facebook/mms-tts-kmr-script_latin",
-    "ckb-facebook-mms VITS": "razhan/mms-tts-ckb",  # Verify w
-    "ky-facebook-mms VITS": "facebook/mms-tts-kir",
-    "lg-facebook-mms VITS": "facebook/mms-tts-lug",
-    "mai-facebook-mms VITS": "facebook/mms-tts-mai",
-    "or-facebook-mms VITS": "facebook/mms-tts-ory",
-    "om-facebook-mms VITS": "facebook/mms-tts-orm",
-    "qu_Huallaga-facebook-mms VITS": "facebook/mms-tts-qub",
-    "qu_Lambayeque-facebook-mms VITS": "facebook/mms-tts-quf",
-    "qu_South_Bolivian-facebook-mms VITS": "facebook/mms-tts-quh",
-    "qu_North_Bolivian-facebook-mms VITS": "facebook/mms-tts-qul",
-    "qu_Tena_Lowland-facebook-mms VITS": "facebook/mms-tts-quw",
-    "qu_Ayacucho-facebook-mms VITS": "facebook/mms-tts-quy",
-    "qu_Cusco-facebook-mms VITS": "facebook/mms-tts-quz",
-    "qu_Cajamarca-facebook-mms VITS": "facebook/mms-tts-qvc",
-    "qu_Eastern_Apurímac-facebook-mms VITS": "facebook/mms-tts-qve",
-    "qu_Huamalíes_Dos_de_Mayo_Huánuco-facebook-mms VITS": "facebook/mms-tts-qvh",
-    "qu_Margos_Yarowilca_Lauricocha-facebook-mms VITS": "facebook/mms-tts-qvm",
-    "qu_North_Junín-facebook-mms VITS": "facebook/mms-tts-qvn",
-    "qu_Napo-facebook-mms VITS": "facebook/mms-tts-qvo",
-    "qu_San_Martín-facebook-mms VITS": "facebook/mms-tts-qvs",
-    "qu_Huaylla_Wanca-facebook-mms VITS": "facebook/mms-tts-qvw",
-    "qu_Northern_Pastaza-facebook-mms VITS": "facebook/mms-tts-qvz",
-    "qu_Huaylas_Ancash-facebook-mms VITS": "facebook/mms-tts-qwh",
-    "qu_Panao-facebook-mms VITS": "facebook/mms-tts-qxh",
-    "qu_Salasaca_Highland-facebook-mms VITS": "facebook/mms-tts-qxl",
-    "qu_Northern_Conchucos_Ancash-facebook-mms VITS": "facebook/mms-tts-qxn",
-    "qu_Southern_Conchucos-facebook-mms VITS": "facebook/mms-tts-qxo",
-    "qu_Cañar_Highland-facebook-mms VITS": "facebook/mms-tts-qxr",
-    "sm-facebook-mms VITS": "facebook/mms-tts-smo",
-    "ti-facebook-mms VITS": "facebook/mms-tts-tir",
-    "ts-facebook-mms VITS": "facebook/mms-tts-tso",
-    "ak-facebook-mms VITS": "facebook/mms-tts-aka",
-    "ug_script_arabic-facebook-mms VITS": "facebook/mms-tts-uig-script_arabic",
-    "ug_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uig-script_cyrillic",
-}
-OPENAI_TTS_CODES = [
-    "af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da",
-    "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is",
-    "id", "it", "ja", "kn", "kk", "ko", "lv", "lt", "mk", "ms", "mr", "mi",
-    "ne", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw",
-    "sv", "tl", "ta", "th", "tr", "uk", "ur", "vi", "cy", "zh-TW"
-]
-OPENAI_TTS_MODELS = [
-    ">alloy OpenAI-TTS",
-    ">echo OpenAI-TTS",
-    ">fable OpenAI-TTS",
-    ">onyx OpenAI-TTS",
-    ">nova OpenAI-TTS",
-    ">shimmer OpenAI-TTS",
-    ">alloy HD OpenAI-TTS",
-    ">echo HD OpenAI-TTS",
-    ">fable HD OpenAI-TTS",
-    ">onyx HD OpenAI-TTS",
-    ">nova HD OpenAI-TTS",
-    ">shimmer HD OpenAI-TTS"
-]
-LANGUAGE_CODE_IN_THREE_LETTERS = {
-    "Automatic detection": "aut",
-    "ar": "ara",
-    "zh": "chi",
-    "cs": "cze",
-    "da": "dan",
-    "nl": "dut",
-    "en": "eng",
-    "fi": "fin",
-    "fr": "fre",
-    "de": "ger",
-    "el": "gre",
-    "he": "heb",
-    "hu": "hun",
-    "it": "ita",
-    "ja": "jpn",
-    "ko": "kor",
-    "fa": "per",
-    "pl": "pol",
-    "pt": "por",
-    "ru": "rus",
-    "es": "spa",
-    "tr": "tur",
-    "uk": "ukr",
-    "ur": "urd",
-    "vi": "vie",
-    "hi": "hin",
-    "id": "ind",
-    "bn": "ben",
-    "te": "tel",
-    "mr": "mar",
-    "ta": "tam",
-    "jw": "jav",
-    "ca": "cat",
-    "ne": "nep",
-    "th": "tha",
-    "sv": "swe",
-    "am": "amh",
-    "cy": "cym",
-    "et": "est",
-    "hr": "hrv",
-    "is": "isl",
-    "km": "khm",
-    "sk": "slk",
-    "sq": "sqi",
-    "sr": "srp",
-}

+from .logging_setup import logger
+LANGUAGES_UNIDIRECTIONAL = {
+    "Aymara (ay)": "ay",
+    "Bambara (bm)": "bm",
+    "Cebuano (ceb)": "ceb",
+    "Chichewa (ny)": "ny",
+    "Divehi (dv)": "dv",
+    "Dogri (doi)": "doi",
+    "Ewe (ee)": "ee",
+    "Guarani (gn)": "gn",
+    "Iloko (ilo)": "ilo",
+    "Kinyarwanda (rw)": "rw",
+    "Krio (kri)": "kri",
+    "Kurdish (ku)": "ku",
+    "Kirghiz (ky)": "ky",
+    "Ganda (lg)": "lg",
+    "Maithili (mai)": "mai",
+    "Oriya (or)": "or",
+    "Oromo (om)": "om",
+    "Quechua (qu)": "qu",
+    "Samoan (sm)": "sm",
+    "Tigrinya (ti)": "ti",
+    "Tsonga (ts)": "ts",
+    "Akan (ak)": "ak",
+    "Uighur (ug)": "ug"
+}
+UNIDIRECTIONAL_L_LIST = LANGUAGES_UNIDIRECTIONAL.keys()
+LANGUAGES = {
+    "Automatic detection": "Automatic detection",
+    "Arabic (ar)": "ar",
+    "Chinese - Simplified (zh-CN)": "zh",
+    "Czech (cs)": "cs",
+    "Danish (da)": "da",
+    "Dutch (nl)": "nl",
+    "English (en)": "en",
+    "Finnish (fi)": "fi",
+    "French (fr)": "fr",
+    "German (de)": "de",
+    "Greek (el)": "el",
+    "Hebrew (he)": "he",
+    "Hungarian (hu)": "hu",
+    "Italian (it)": "it",
+    "Japanese (ja)": "ja",
+    "Korean (ko)": "ko",
+    "Persian (fa)": "fa",  # no aux gTTS
+    "Polish (pl)": "pl",
+    "Portuguese (pt)": "pt",
+    "Russian (ru)": "ru",
+    "Spanish (es)": "es",
+    "Turkish (tr)": "tr",
+    "Ukrainian (uk)": "uk",
+    "Urdu (ur)": "ur",
+    "Vietnamese (vi)": "vi",
+    "Hindi (hi)": "hi",
+    "Indonesian (id)": "id",
+    "Bengali (bn)": "bn",
+    "Telugu (te)": "te",
+    "Marathi (mr)": "mr",
+    "Tamil (ta)": "ta",
+    "Javanese (jw|jv)": "jw",
+    "Catalan (ca)": "ca",
+    "Nepali (ne)": "ne",
+    "Thai (th)": "th",
+    "Swedish (sv)": "sv",
+    "Amharic (am)": "am",
+    "Welsh (cy)": "cy",  # no aux gTTS
+    "Estonian (et)": "et",
+    "Croatian (hr)": "hr",
+    "Icelandic (is)": "is",
+    "Georgian (ka)": "ka",  # no aux gTTS
+    "Khmer (km)": "km",
+    "Slovak (sk)": "sk",
+    "Albanian (sq)": "sq",
+    "Serbian (sr)": "sr",
+    "Azerbaijani (az)": "az",  # no aux gTTS
+    "Bulgarian (bg)": "bg",
+    "Galician (gl)": "gl",  # no aux gTTS
+    "Gujarati (gu)": "gu",
+    "Kazakh (kk)": "kk",  # no aux gTTS
+    "Kannada (kn)": "kn",
+    "Lithuanian (lt)": "lt",  # no aux gTTS
+    "Latvian (lv)": "lv",
+    "Macedonian (mk)": "mk",  # no aux gTTS # error get align model
+    "Malayalam (ml)": "ml",
+    "Malay (ms)": "ms",  # error get align model
+    "Romanian (ro)": "ro",
+    "Sinhala (si)": "si",
+    "Sundanese (su)": "su",
+    "Swahili (sw)": "sw",  # error aling
+    "Afrikaans (af)": "af",
+    "Bosnian (bs)": "bs",
+    "Latin (la)": "la",
+    "Myanmar Burmese (my)": "my",
+    "Norwegian (no|nb)": "no",
+    "Chinese - Traditional (zh-TW)": "zh-TW",
+    "Assamese (as)": "as",
+    "Basque (eu)": "eu",
+    "Hausa (ha)": "ha",
+    "Haitian Creole (ht)": "ht",
+    "Armenian (hy)": "hy",
+    "Lao (lo)": "lo",
+    "Malagasy (mg)": "mg",
+    "Mongolian (mn)": "mn",
+    "Maltese (mt)": "mt",
+    "Punjabi (pa)": "pa",
+    "Pashto (ps)": "ps",
+    "Slovenian (sl)": "sl",
+    "Shona (sn)": "sn",
+    "Somali (so)": "so",
+    "Tajik (tg)": "tg",
+    "Turkmen (tk)": "tk",
+    "Tatar (tt)": "tt",
+    "Uzbek (uz)": "uz",
+    "Yoruba (yo)": "yo",
+    **LANGUAGES_UNIDIRECTIONAL
+}
+# Live view over the display labels of LANGUAGES, in insertion order.
+BASE_L_LIST = LANGUAGES.keys()
+# Keep the first label ("Automatic detection") on top and sort the remaining labels.
+LANGUAGES_LIST = [list(BASE_L_LIST)[0]] + sorted(list(BASE_L_LIST)[1:])
+# Reverse lookup: language code -> display label.
+INVERTED_LANGUAGES = {value: key for key, value in LANGUAGES.items()}
+EXTRA_ALIGN = {
+    "id": "indonesian-nlp/wav2vec2-large-xlsr-indonesian",
+    "bn": "arijitx/wav2vec2-large-xlsr-bengali",
+    "mr": "sumedh/wav2vec2-large-xlsr-marathi",
+    "ta": "Amrrs/wav2vec2-large-xlsr-53-tamil",
+    "jw": "cahya/wav2vec2-large-xlsr-javanese",
+    "ne": "shniranjan/wav2vec2-large-xlsr-300m-nepali",
+    "th": "sakares/wav2vec2-large-xlsr-thai-demo",
+    "sv": "KBLab/wav2vec2-large-voxrex-swedish",
+    "am": "agkphysics/wav2vec2-large-xlsr-53-amharic",
+    "cy": "Srulikbdd/Wav2Vec2-large-xlsr-welsh",
+    "et": "anton-l/wav2vec2-large-xlsr-53-estonian",
+    "hr": "classla/wav2vec2-xls-r-parlaspeech-hr",
+    "is": "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h",
+    "ka": "MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Georgian",
+    "km": "vitouphy/wav2vec2-xls-r-300m-khmer",
+    "sk": "infinitejoy/wav2vec2-large-xls-r-300m-slovak",
+    "sq": "Alimzhan/wav2vec2-large-xls-r-300m-albanian-colab",
+    "sr": "dnikolic/wav2vec2-xlsr-530-serbian-colab",
+    "az": "nijatzeynalov/wav2vec2-large-mms-1b-azerbaijani-common_voice15.0",
+    "bg": "infinitejoy/wav2vec2-large-xls-r-300m-bulgarian",
+    "gl": "ifrz/wav2vec2-large-xlsr-galician",
+    "gu": "Harveenchadha/vakyansh-wav2vec2-gujarati-gnm-100",
+    "kk": "aismlv/wav2vec2-large-xlsr-kazakh",
+    "kn": "Harveenchadha/vakyansh-wav2vec2-kannada-knm-560",
+    "lt": "DeividasM/wav2vec2-large-xlsr-53-lithuanian",
+    "lv": "anton-l/wav2vec2-large-xlsr-53-latvian",
+    "mk": "",  # Konstantin-Bogdanoski/wav2vec2-macedonian-base
+    "ml": "gvs/wav2vec2-large-xlsr-malayalam",
+    "ms": "",  # Duy/wav2vec2_malay
+    "ro": "anton-l/wav2vec2-large-xlsr-53-romanian",
+    "si": "IAmNotAnanth/wav2vec2-large-xls-r-300m-sinhala",
+    "su": "cahya/wav2vec2-large-xlsr-sundanese",
+    "sw": "",  # Lians/fine-tune-wav2vec2-large-swahili
+    "af": "",  # ylacombe/wav2vec2-common_voice-af-demo
+    "bs": "",
+    "la": "",
+    "my": "",
+    "no": "NbAiLab/wav2vec2-xlsr-300m-norwegian",
+    "zh-TW": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn",
+    "as": "",
+    "eu": "", # cahya/wav2vec2-large-xlsr-basque # verify
+    "ha": "infinitejoy/wav2vec2-large-xls-r-300m-hausa",
+    "ht": "",
+    "hy": "infinitejoy/wav2vec2-large-xls-r-300m-armenian", # no (.)
+    "lo": "",
+    "mg": "",
+    "mn": "tugstugi/wav2vec2-large-xlsr-53-mongolian",
+    "mt": "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-maltese-64h",
+    "pa": "kingabzpro/wav2vec2-large-xlsr-53-punjabi",
+    "ps": "aamirhs/wav2vec2-large-xls-r-300m-pashto-colab",
+    "sl": "anton-l/wav2vec2-large-xlsr-53-slovenian",
+    "sn": "",
+    "so": "",
+    "tg": "",
+    "tk": "",  # Ragav/wav2vec2-tk
+    "tt": "anton-l/wav2vec2-large-xlsr-53-tatar",
+    "uz": "",  # Mekhriddin/wav2vec2-large-xls-r-300m-uzbek-colab
+    "yo": "ogbi/wav2vec2-large-mms-1b-yoruba-test",
+}
+def fix_code_language(translate_to, syntax="google"):
+    """Return the language code spelled the way the target backend expects.
+    Args:
+        translate_to: Language code to normalize (e.g. "zh", "he", "zh-TW").
+        syntax: Backend naming convention; "google" (google-translator, gTTS)
+            or "coqui" (coqui-xtts). Any other value applies no overrides.
+    Returns:
+        The backend-specific code, or ``translate_to`` unchanged when the
+        selected backend has no override for it.
+    """
+    # Per-backend overrides; codes not listed here are passed through as-is.
+    replace_lang_code_by_syntax = {
+        # google-translator, gTTS
+        "google": {"zh": "zh-CN", "he": "iw", "zh-cn": "zh-CN"},
+        # coqui-xtts
+        "coqui": {"zh": "zh-cn", "zh-CN": "zh-cn", "zh-TW": "zh-cn"},
+    }
+    # An unrecognized syntax previously left the mapping unassigned and
+    # raised UnboundLocalError; fall back to an empty mapping instead.
+    replace_lang_code = replace_lang_code_by_syntax.get(syntax, {})
+    new_code_lang = replace_lang_code.get(translate_to, translate_to)
+    logger.debug(f"Fix code {translate_to} -> {new_code_lang}")
+    return new_code_lang
+BARK_VOICES_LIST = {
+    "de_speaker_0-Male BARK": "v2/de_speaker_0",
+    "de_speaker_1-Male BARK": "v2/de_speaker_1",
+    "de_speaker_2-Male BARK": "v2/de_speaker_2",
+    "de_speaker_3-Female BARK": "v2/de_speaker_3",
+    "de_speaker_4-Male BARK": "v2/de_speaker_4",
+    "de_speaker_5-Male BARK": "v2/de_speaker_5",
+    "de_speaker_6-Male BARK": "v2/de_speaker_6",
+    "de_speaker_7-Male BARK": "v2/de_speaker_7",
+    "de_speaker_8-Female BARK": "v2/de_speaker_8",
+    "de_speaker_9-Male BARK": "v2/de_speaker_9",
+    "en_speaker_0-Male BARK": "v2/en_speaker_0",
+    "en_speaker_1-Male BARK": "v2/en_speaker_1",
+    "en_speaker_2-Male BARK": "v2/en_speaker_2",
+    "en_speaker_3-Male BARK": "v2/en_speaker_3",
+    "en_speaker_4-Male BARK": "v2/en_speaker_4",
+    "en_speaker_5-Male BARK": "v2/en_speaker_5",
+    "en_speaker_6-Male BARK": "v2/en_speaker_6",
+    "en_speaker_7-Male BARK": "v2/en_speaker_7",
+    "en_speaker_8-Male BARK": "v2/en_speaker_8",
+    "en_speaker_9-Female BARK": "v2/en_speaker_9",
+    "es_speaker_0-Male BARK": "v2/es_speaker_0",
+    "es_speaker_1-Male BARK": "v2/es_speaker_1",
+    "es_speaker_2-Male BARK": "v2/es_speaker_2",
+    "es_speaker_3-Male BARK": "v2/es_speaker_3",
+    "es_speaker_4-Male BARK": "v2/es_speaker_4",
+    "es_speaker_5-Male BARK": "v2/es_speaker_5",
+    "es_speaker_6-Male BARK": "v2/es_speaker_6",
+    "es_speaker_7-Male BARK": "v2/es_speaker_7",
+    "es_speaker_8-Female BARK": "v2/es_speaker_8",
+    "es_speaker_9-Female BARK": "v2/es_speaker_9",
+    "fr_speaker_0-Male BARK": "v2/fr_speaker_0",
+    "fr_speaker_1-Female BARK": "v2/fr_speaker_1",
+    "fr_speaker_2-Female BARK": "v2/fr_speaker_2",
+    "fr_speaker_3-Male BARK": "v2/fr_speaker_3",
+    "fr_speaker_4-Male BARK": "v2/fr_speaker_4",
+    "fr_speaker_5-Female BARK": "v2/fr_speaker_5",
+    "fr_speaker_6-Male BARK": "v2/fr_speaker_6",
+    "fr_speaker_7-Male BARK": "v2/fr_speaker_7",
+    "fr_speaker_8-Male BARK": "v2/fr_speaker_8",
+    "fr_speaker_9-Male BARK": "v2/fr_speaker_9",
+    "hi_speaker_0-Female BARK": "v2/hi_speaker_0",
+    "hi_speaker_1-Female BARK": "v2/hi_speaker_1",
+    "hi_speaker_2-Male BARK": "v2/hi_speaker_2",
+    "hi_speaker_3-Female BARK": "v2/hi_speaker_3",
+    "hi_speaker_4-Female BARK": "v2/hi_speaker_4",
+    "hi_speaker_5-Male BARK": "v2/hi_speaker_5",
+    "hi_speaker_6-Male BARK": "v2/hi_speaker_6",
+    "hi_speaker_7-Male BARK": "v2/hi_speaker_7",
+    "hi_speaker_8-Male BARK": "v2/hi_speaker_8",
+    "hi_speaker_9-Female BARK": "v2/hi_speaker_9",
+    "it_speaker_0-Male BARK": "v2/it_speaker_0",
+    "it_speaker_1-Male BARK": "v2/it_speaker_1",
+    "it_speaker_2-Female BARK": "v2/it_speaker_2",
+    "it_speaker_3-Male BARK": "v2/it_speaker_3",
+    "it_speaker_4-Male BARK": "v2/it_speaker_4",
+    "it_speaker_5-Male BARK": "v2/it_speaker_5",
+    "it_speaker_6-Male BARK": "v2/it_speaker_6",
+    "it_speaker_7-Female BARK": "v2/it_speaker_7",
+    "it_speaker_8-Male BARK": "v2/it_speaker_8",
+    "it_speaker_9-Female BARK": "v2/it_speaker_9",
+    "ja_speaker_0-Female BARK": "v2/ja_speaker_0",
+    "ja_speaker_1-Female BARK": "v2/ja_speaker_1",
+    "ja_speaker_2-Male BARK": "v2/ja_speaker_2",
+    "ja_speaker_3-Female BARK": "v2/ja_speaker_3",
+    "ja_speaker_4-Female BARK": "v2/ja_speaker_4",
+    "ja_speaker_5-Female BARK": "v2/ja_speaker_5",
+    "ja_speaker_6-Male BARK": "v2/ja_speaker_6",
+    "ja_speaker_7-Female BARK": "v2/ja_speaker_7",
+    "ja_speaker_8-Female BARK": "v2/ja_speaker_8",
+    "ja_speaker_9-Female BARK": "v2/ja_speaker_9",
+    "ko_speaker_0-Female BARK": "v2/ko_speaker_0",
+    "ko_speaker_1-Male BARK": "v2/ko_speaker_1",
+    "ko_speaker_2-Male BARK": "v2/ko_speaker_2",
+    "ko_speaker_3-Male BARK": "v2/ko_speaker_3",
+    "ko_speaker_4-Male BARK": "v2/ko_speaker_4",
+    "ko_speaker_5-Male BARK": "v2/ko_speaker_5",
+    "ko_speaker_6-Male BARK": "v2/ko_speaker_6",
+    "ko_speaker_7-Male BARK": "v2/ko_speaker_7",
+    "ko_speaker_8-Male BARK": "v2/ko_speaker_8",
+    "ko_speaker_9-Male BARK": "v2/ko_speaker_9",
+    "pl_speaker_0-Male BARK": "v2/pl_speaker_0",
+    "pl_speaker_1-Male BARK": "v2/pl_speaker_1",
+    "pl_speaker_2-Male BARK": "v2/pl_speaker_2",
+    "pl_speaker_3-Male BARK": "v2/pl_speaker_3",
+    "pl_speaker_4-Female BARK": "v2/pl_speaker_4",
+    "pl_speaker_5-Male BARK": "v2/pl_speaker_5",
+    "pl_speaker_6-Female BARK": "v2/pl_speaker_6",
+    "pl_speaker_7-Male BARK": "v2/pl_speaker_7",
+    "pl_speaker_8-Male BARK": "v2/pl_speaker_8",
+    "pl_speaker_9-Female BARK": "v2/pl_speaker_9",
+    "pt_speaker_0-Male BARK": "v2/pt_speaker_0",
+    "pt_speaker_1-Male BARK": "v2/pt_speaker_1",
+    "pt_speaker_2-Male BARK": "v2/pt_speaker_2",
+    "pt_speaker_3-Male BARK": "v2/pt_speaker_3",
+    "pt_speaker_4-Male BARK": "v2/pt_speaker_4",
+    "pt_speaker_5-Male BARK": "v2/pt_speaker_5",
+    "pt_speaker_6-Male BARK": "v2/pt_speaker_6",
+    "pt_speaker_7-Male BARK": "v2/pt_speaker_7",
+    "pt_speaker_8-Male BARK": "v2/pt_speaker_8",
+    "pt_speaker_9-Male BARK": "v2/pt_speaker_9",
+    "ru_speaker_0-Male BARK": "v2/ru_speaker_0",
+    "ru_speaker_1-Male BARK": "v2/ru_speaker_1",
+    "ru_speaker_2-Male BARK": "v2/ru_speaker_2",
+    "ru_speaker_3-Male BARK": "v2/ru_speaker_3",
+    "ru_speaker_4-Male BARK": "v2/ru_speaker_4",
+    "ru_speaker_5-Female BARK": "v2/ru_speaker_5",
+    "ru_speaker_6-Female BARK": "v2/ru_speaker_6",
+    "ru_speaker_7-Male BARK": "v2/ru_speaker_7",
+    "ru_speaker_8-Male BARK": "v2/ru_speaker_8",
+    "ru_speaker_9-Female BARK": "v2/ru_speaker_9",
+    "tr_speaker_0-Male BARK": "v2/tr_speaker_0",
+    "tr_speaker_1-Male BARK": "v2/tr_speaker_1",
+    "tr_speaker_2-Male BARK": "v2/tr_speaker_2",
+    "tr_speaker_3-Male BARK": "v2/tr_speaker_3",
+    "tr_speaker_4-Female BARK": "v2/tr_speaker_4",
+    "tr_speaker_5-Female BARK": "v2/tr_speaker_5",
+    "tr_speaker_6-Male BARK": "v2/tr_speaker_6",
+    "tr_speaker_7-Male BARK": "v2/tr_speaker_7",
+    "tr_speaker_8-Male BARK": "v2/tr_speaker_8",
+    "tr_speaker_9-Male BARK": "v2/tr_speaker_9",
+    "zh_speaker_0-Male BARK": "v2/zh_speaker_0",
+    "zh_speaker_1-Male BARK": "v2/zh_speaker_1",
+    "zh_speaker_2-Male BARK": "v2/zh_speaker_2",
+    "zh_speaker_3-Male BARK": "v2/zh_speaker_3",
+    "zh_speaker_4-Female BARK": "v2/zh_speaker_4",
+    "zh_speaker_5-Male BARK": "v2/zh_speaker_5",
+    "zh_speaker_6-Female BARK": "v2/zh_speaker_6",
+    "zh_speaker_7-Female BARK": "v2/zh_speaker_7",
+    "zh_speaker_8-Male BARK": "v2/zh_speaker_8",
+    "zh_speaker_9-Female BARK": "v2/zh_speaker_9",
+}
+VITS_VOICES_LIST = {
+    "ar-facebook-mms VITS": "facebook/mms-tts-ara",
+    # 'zh-facebook-mms VITS': 'facebook/mms-tts-cmn',
+    "zh_Hakka-facebook-mms VITS": "facebook/mms-tts-hak",
+    "zh_MinNan-facebook-mms VITS": "facebook/mms-tts-nan",
+    # 'cs-facebook-mms VITS': 'facebook/mms-tts-ces',
+    # 'da-facebook-mms VITS': 'facebook/mms-tts-dan',
+    "nl-facebook-mms VITS": "facebook/mms-tts-nld",
+    "en-facebook-mms VITS": "facebook/mms-tts-eng",
+    "fi-facebook-mms VITS": "facebook/mms-tts-fin",
+    "fr-facebook-mms VITS": "facebook/mms-tts-fra",
+    "de-facebook-mms VITS": "facebook/mms-tts-deu",
+    "el-facebook-mms VITS": "facebook/mms-tts-ell",
+    "el_Ancient-facebook-mms VITS": "facebook/mms-tts-grc",
+    "he-facebook-mms VITS": "facebook/mms-tts-heb",
+    "hu-facebook-mms VITS": "facebook/mms-tts-hun",
+    # 'it-facebook-mms VITS': 'facebook/mms-tts-ita',
+    # 'ja-facebook-mms VITS': 'facebook/mms-tts-jpn',
+    "ko-facebook-mms VITS": "facebook/mms-tts-kor",
+    "fa-facebook-mms VITS": "facebook/mms-tts-fas",
+    "pl-facebook-mms VITS": "facebook/mms-tts-pol",
+    "pt-facebook-mms VITS": "facebook/mms-tts-por",
+    "ru-facebook-mms VITS": "facebook/mms-tts-rus",
+    "es-facebook-mms VITS": "facebook/mms-tts-spa",
+    "tr-facebook-mms VITS": "facebook/mms-tts-tur",
+    "uk-facebook-mms VITS": "facebook/mms-tts-ukr",
+    "ur_arabic-facebook-mms VITS": "facebook/mms-tts-urd-script_arabic",
+    "ur_devanagari-facebook-mms VITS": "facebook/mms-tts-urd-script_devanagari",
+    "ur_latin-facebook-mms VITS": "facebook/mms-tts-urd-script_latin",
+    "vi-facebook-mms VITS": "facebook/mms-tts-vie",
+    "hi-facebook-mms VITS": "facebook/mms-tts-hin",
+    "hi_Fiji-facebook-mms VITS": "facebook/mms-tts-hif",
+    "id-facebook-mms VITS": "facebook/mms-tts-ind",
+    "bn-facebook-mms VITS": "facebook/mms-tts-ben",
+    "te-facebook-mms VITS": "facebook/mms-tts-tel",
+    "mr-facebook-mms VITS": "facebook/mms-tts-mar",
+    "ta-facebook-mms VITS": "facebook/mms-tts-tam",
+    "jw-facebook-mms VITS": "facebook/mms-tts-jav",
+    "jw_Suriname-facebook-mms VITS": "facebook/mms-tts-jvn",
+    "ca-facebook-mms VITS": "facebook/mms-tts-cat",
+    "ne-facebook-mms VITS": "facebook/mms-tts-nep",
+    "th-facebook-mms VITS": "facebook/mms-tts-tha",
+    "th_Northern-facebook-mms VITS": "facebook/mms-tts-nod",
+    "sv-facebook-mms VITS": "facebook/mms-tts-swe",
+    "am-facebook-mms VITS": "facebook/mms-tts-amh",
+    "cy-facebook-mms VITS": "facebook/mms-tts-cym",
+    # "et-facebook-mms VITS": "facebook/mms-tts-est",
+    # "ht-facebook-mms VITS": "facebook/mms-tts-hrv",
+    "is-facebook-mms VITS": "facebook/mms-tts-isl",
+    "km-facebook-mms VITS": "facebook/mms-tts-khm",
+    "km_Northern-facebook-mms VITS": "facebook/mms-tts-kxm",
+    # "sk-facebook-mms VITS": "facebook/mms-tts-slk",
+    "sq_Northern-facebook-mms VITS": "facebook/mms-tts-sqi",
+    "az_South-facebook-mms VITS": "facebook/mms-tts-azb",
+    "az_North_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-azj-script_cyrillic",
+    "az_North_script_latin-facebook-mms VITS": "facebook/mms-tts-azj-script_latin",
+    "bg-facebook-mms VITS": "facebook/mms-tts-bul",
+    # "gl-facebook-mms VITS": "facebook/mms-tts-glg",
+    "gu-facebook-mms VITS": "facebook/mms-tts-guj",
+    "kk-facebook-mms VITS": "facebook/mms-tts-kaz",
+    "kn-facebook-mms VITS": "facebook/mms-tts-kan",
+    # "lt-facebook-mms VITS": "facebook/mms-tts-lit",
+    "lv-facebook-mms VITS": "facebook/mms-tts-lav",
+    # "mk-facebook-mms VITS": "facebook/mms-tts-mkd",
+    "ml-facebook-mms VITS": "facebook/mms-tts-mal",
+    "ms-facebook-mms VITS": "facebook/mms-tts-zlm",
+    "ms_Central-facebook-mms VITS": "facebook/mms-tts-pse",
+    "ms_Manado-facebook-mms VITS": "facebook/mms-tts-xmm",
+    "ro-facebook-mms VITS": "facebook/mms-tts-ron",
+    # "si-facebook-mms VITS": "facebook/mms-tts-sin",
+    "sw-facebook-mms VITS": "facebook/mms-tts-swh",
+    # "af-facebook-mms VITS": "facebook/mms-tts-afr",
+    # "bs-facebook-mms VITS": "facebook/mms-tts-bos",
+    "la-facebook-mms VITS": "facebook/mms-tts-lat",
+    "my-facebook-mms VITS": "facebook/mms-tts-mya",
+    # "no_Bokmål-facebook-mms VITS": "thomasht86/mms-tts-nob",  # verify
+    "as-facebook-mms VITS": "facebook/mms-tts-asm",
+    "as_Nagamese-facebook-mms VITS": "facebook/mms-tts-nag",
+    "eu-facebook-mms VITS": "facebook/mms-tts-eus",
+    "ha-facebook-mms VITS": "facebook/mms-tts-hau",
+    "ht-facebook-mms VITS": "facebook/mms-tts-hat",
+    "hy_Western-facebook-mms VITS": "facebook/mms-tts-hyw",
+    "lo-facebook-mms VITS": "facebook/mms-tts-lao",
+    "mg-facebook-mms VITS": "facebook/mms-tts-mlg",
+    "mn-facebook-mms VITS": "facebook/mms-tts-mon",
+    # "mt-facebook-mms VITS": "facebook/mms-tts-mlt",
+    "pa_Eastern-facebook-mms VITS": "facebook/mms-tts-pan",
+    # "pa_Western-facebook-mms VITS": "facebook/mms-tts-pnb",
+    # "ps-facebook-mms VITS": "facebook/mms-tts-pus",
+    # "sl-facebook-mms VITS": "facebook/mms-tts-slv",
+    "sn-facebook-mms VITS": "facebook/mms-tts-sna",
+    "so-facebook-mms VITS": "facebook/mms-tts-son",
+    "tg-facebook-mms VITS": "facebook/mms-tts-tgk",
+    "tk_script_arabic-facebook-mms VITS": "facebook/mms-tts-tuk-script_arabic",
+    "tk_script_latin-facebook-mms VITS": "facebook/mms-tts-tuk-script_latin",
+    "tt-facebook-mms VITS": "facebook/mms-tts-tat",
+    "tt_Crimean-facebook-mms VITS": "facebook/mms-tts-crh",
+    "uz_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uzb-script_cyrillic",
+    "yo-facebook-mms VITS": "facebook/mms-tts-yor",
+    "ay-facebook-mms VITS": "facebook/mms-tts-ayr",
+    "bm-facebook-mms VITS": "facebook/mms-tts-bam",
+    "ceb-facebook-mms VITS": "facebook/mms-tts-ceb",
+    "ny-facebook-mms VITS": "facebook/mms-tts-nya",
+    "dv-facebook-mms VITS": "facebook/mms-tts-div",
+    "doi-facebook-mms VITS": "facebook/mms-tts-dgo",
+    "ee-facebook-mms VITS": "facebook/mms-tts-ewe",
+    "gn-facebook-mms VITS": "facebook/mms-tts-grn",
+    "ilo-facebook-mms VITS": "facebook/mms-tts-ilo",
+    "rw-facebook-mms VITS": "facebook/mms-tts-kin",
+    "kri-facebook-mms VITS": "facebook/mms-tts-kri",
+    "ku_script_arabic-facebook-mms VITS": "facebook/mms-tts-kmr-script_arabic",
+    "ku_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-kmr-script_cyrillic",
+    "ku_script_latin-facebook-mms VITS": "facebook/mms-tts-kmr-script_latin",
+    "ckb-facebook-mms VITS": "razhan/mms-tts-ckb",  # Verify w
+    "ky-facebook-mms VITS": "facebook/mms-tts-kir",
+    "lg-facebook-mms VITS": "facebook/mms-tts-lug",
+    "mai-facebook-mms VITS": "facebook/mms-tts-mai",
+    "or-facebook-mms VITS": "facebook/mms-tts-ory",
+    "om-facebook-mms VITS": "facebook/mms-tts-orm",
+    "qu_Huallaga-facebook-mms VITS": "facebook/mms-tts-qub",
+    "qu_Lambayeque-facebook-mms VITS": "facebook/mms-tts-quf",
+    "qu_South_Bolivian-facebook-mms VITS": "facebook/mms-tts-quh",
+    "qu_North_Bolivian-facebook-mms VITS": "facebook/mms-tts-qul",
+    "qu_Tena_Lowland-facebook-mms VITS": "facebook/mms-tts-quw",
+    "qu_Ayacucho-facebook-mms VITS": "facebook/mms-tts-quy",
+    "qu_Cusco-facebook-mms VITS": "facebook/mms-tts-quz",
+    "qu_Cajamarca-facebook-mms VITS": "facebook/mms-tts-qvc",
+    "qu_Eastern_Apurímac-facebook-mms VITS": "facebook/mms-tts-qve",
+    "qu_Huamalíes_Dos_de_Mayo_Huánuco-facebook-mms VITS": "facebook/mms-tts-qvh",
+    "qu_Margos_Yarowilca_Lauricocha-facebook-mms VITS": "facebook/mms-tts-qvm",
+    "qu_North_Junín-facebook-mms VITS": "facebook/mms-tts-qvn",
+    "qu_Napo-facebook-mms VITS": "facebook/mms-tts-qvo",
+    "qu_San_Martín-facebook-mms VITS": "facebook/mms-tts-qvs",
+    "qu_Huaylla_Wanca-facebook-mms VITS": "facebook/mms-tts-qvw",
+    "qu_Northern_Pastaza-facebook-mms VITS": "facebook/mms-tts-qvz",
+    "qu_Huaylas_Ancash-facebook-mms VITS": "facebook/mms-tts-qwh",
+    "qu_Panao-facebook-mms VITS": "facebook/mms-tts-qxh",
+    "qu_Salasaca_Highland-facebook-mms VITS": "facebook/mms-tts-qxl",
+    "qu_Northern_Conchucos_Ancash-facebook-mms VITS": "facebook/mms-tts-qxn",
+    "qu_Southern_Conchucos-facebook-mms VITS": "facebook/mms-tts-qxo",
+    "qu_Cañar_Highland-facebook-mms VITS": "facebook/mms-tts-qxr",
+    "sm-facebook-mms VITS": "facebook/mms-tts-smo",
+    "ti-facebook-mms VITS": "facebook/mms-tts-tir",
+    "ts-facebook-mms VITS": "facebook/mms-tts-tso",
+    "ak-facebook-mms VITS": "facebook/mms-tts-aka",
+    "ug_script_arabic-facebook-mms VITS": "facebook/mms-tts-uig-script_arabic",
+    "ug_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uig-script_cyrillic",
+}
+OPENAI_TTS_CODES = [
+    "af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da",
+    "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is",
+    "id", "it", "ja", "kn", "kk", "ko", "lv", "lt", "mk", "ms", "mr", "mi",
+    "ne", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw",
+    "sv", "tl", "ta", "th", "tr", "uk", "ur", "vi", "cy", "zh-TW"
+]
+OPENAI_TTS_MODELS = [
+    ">alloy OpenAI-TTS",
+    ">echo OpenAI-TTS",
+    ">fable OpenAI-TTS",
+    ">onyx OpenAI-TTS",
+    ">nova OpenAI-TTS",
+    ">shimmer OpenAI-TTS",
+    ">alloy HD OpenAI-TTS",
+    ">echo HD OpenAI-TTS",
+    ">fable HD OpenAI-TTS",
+    ">onyx HD OpenAI-TTS",
+    ">nova HD OpenAI-TTS",
+    ">shimmer HD OpenAI-TTS"
+]
+LANGUAGE_CODE_IN_THREE_LETTERS = {
+    "Automatic detection": "aut",
+    "ar": "ara",
+    "zh": "chi",
+    "cs": "cze",
+    "da": "dan",
+    "nl": "dut",
+    "en": "eng",
+    "fi": "fin",
+    "fr": "fre",
+    "de": "ger",
+    "el": "gre",
+    "he": "heb",
+    "hu": "hun",
+    "it": "ita",
+    "ja": "jpn",
+    "ko": "kor",
+    "fa": "per",
+    "pl": "pol",
+    "pt": "por",
+    "ru": "rus",
+    "es": "spa",
+    "tr": "tur",
+    "uk": "ukr",
+    "ur": "urd",
+    "vi": "vie",
+    "hi": "hin",
+    "id": "ind",
+    "bn": "ben",
+    "te": "tel",
+    "mr": "mar",
+    "ta": "tam",
+    "jw": "jav",
+    "ca": "cat",
+    "ne": "nep",
+    "th": "tha",
+    "sv": "swe",
+    "am": "amh",
+    "cy": "cym",
+    "et": "est",
+    "hr": "hrv",
+    "is": "isl",
+    "km": "khm",
+    "sk": "slk",
+    "sq": "sqi",
+    "sr": "srp",
+}

soni_translate/languages_gui.py CHANGED Viewed

@@ -2,7 +2,7 @@
 news = """ ## 📖 News
-        🔥 2024/18/05: Overlap reduction. OpenAI API key integration for transcription, translation, and TTS. Output type: subtitles by speaker, separate audio sound, and video only with subtitles. Now you have access to a better-performing version of Whisper for transcribing speech. For example, you can use `kotoba-tech/kotoba-whisper-v1.1` for Japanese transcription, available [here](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1). You can find these improved models on the [Hugging Face Whisper page](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending&search=whisper). Simply copy the repository ID and paste it into the 'Whisper ASR model' in 'Advanced Settings'. Support for ass subtitles and batch processing with subtitles. Vocal enhancement before transcription. Added CPU mode with `app_rvc.py --cpu_mode`. TTS now supports up to 12 speakers. OpenVoiceV2 has been integrated for voice imitation. PDF to videobook (displays images from the PDF).
         🔥 2024/03/02: Preserve file names in output. Multiple archives can now be submitted simultaneously by specifying their paths, directories or URLs separated by commas. Added option for disabling diarization. Implemented soft subtitles. Format output (MP3, MP4, MKV, WAV, and OGG), and resolved issues related to file reading and diarization.

 news = """ ## 📖 News
+        🔥 2024/05/18: Overlap reduction. OpenAI API key integration for transcription, translation, and TTS. Output type: subtitles by speaker, separate audio sound, and video only with subtitles. Now you have access to a better-performing version of Whisper for transcribing speech. For example, you can use `kotoba-tech/kotoba-whisper-v1.1` for Japanese transcription, available [here](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1). You can find these improved models on the [Hugging Face Whisper page](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending&search=whisper). Simply copy the repository ID and paste it into the 'Whisper ASR model' in 'Advanced Settings'. Support for ass subtitles and batch processing with subtitles. Vocal enhancement before transcription. Added CPU mode with `app_rvc.py --cpu_mode`. TTS now supports up to 12 speakers. OpenVoiceV2 has been integrated for voice imitation. PDF to videobook (displays images from the PDF).
         🔥 2024/03/02: Preserve file names in output. Multiple archives can now be submitted simultaneously by specifying their paths, directories or URLs separated by commas. Added option for disabling diarization. Implemented soft subtitles. Format output (MP3, MP4, MKV, WAV, and OGG), and resolved issues related to file reading and diarization.

soni_translate/logging_setup.py CHANGED Viewed

@@ -1,68 +1,68 @@
-import logging
-import sys
-import warnings
-import os
-def configure_logging_libs(debug=False):
-    warnings.filterwarnings(
-      action="ignore", category=UserWarning, module="pyannote"
-    )
-    modules = [
-      "numba", "httpx", "markdown_it", "speechbrain", "fairseq", "pyannote",
-      "faiss",
-      "pytorch_lightning.utilities.migration.utils",
-      "pytorch_lightning.utilities.migration",
-      "pytorch_lightning",
-      "lightning",
-      "lightning.pytorch.utilities.migration.utils",
-    ]
-    try:
-        for module in modules:
-            logging.getLogger(module).setLevel(logging.WARNING)
-        os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" if not debug else "1"
-        # fix verbose pyannote audio
-        def fix_verbose_pyannote(*args, what=""):
-            pass
-        import pyannote.audio.core.model # noqa
-        pyannote.audio.core.model.check_version = fix_verbose_pyannote
-    except Exception as error:
-        logger.error(str(error))
-def setup_logger(name_log):
-    logger = logging.getLogger(name_log)
-    logger.setLevel(logging.INFO)
-    _default_handler = logging.StreamHandler()  # Set sys.stderr as stream.
-    _default_handler.flush = sys.stderr.flush
-    logger.addHandler(_default_handler)
-    logger.propagate = False
-    handlers = logger.handlers
-    for handler in handlers:
-        formatter = logging.Formatter("[%(levelname)s] >> %(message)s")
-        handler.setFormatter(formatter)
-    # logger.handlers
-    return logger
-logger = setup_logger("sonitranslate")
-logger.setLevel(logging.INFO)
-def set_logging_level(verbosity_level):
-    logging_level_mapping = {
-        "debug": logging.DEBUG,
-        "info": logging.INFO,
-        "warning": logging.WARNING,
-        "error": logging.ERROR,
-        "critical": logging.CRITICAL,
-    }
-    logger.setLevel(logging_level_mapping.get(verbosity_level, logging.INFO))

+import logging
+import sys
+import warnings
+import os
+def configure_logging_libs(debug=False):
+    """Quiet the logging output of noisy third-party libraries.
+    Ignores ``UserWarning`` raised from ``pyannote``, raises the level of a
+    fixed list of library loggers to WARNING, sets ``TF_CPP_MIN_LOG_LEVEL``
+    and replaces pyannote's ``check_version`` with a no-op.
+    Args:
+        debug: When true ``TF_CPP_MIN_LOG_LEVEL`` is set to "1", otherwise
+            to "3".
+    Any exception raised while doing so is logged through the module-level
+    ``logger`` instead of being propagated.
+    """
+    warnings.filterwarnings(
+        action="ignore", category=UserWarning, module="pyannote"
+    )
+    noisy_loggers = (
+        "numba",
+        "httpx",
+        "markdown_it",
+        "speechbrain",
+        "fairseq",
+        "pyannote",
+        "faiss",
+        "pytorch_lightning.utilities.migration.utils",
+        "pytorch_lightning.utilities.migration",
+        "pytorch_lightning",
+        "lightning",
+        "lightning.pytorch.utilities.migration.utils",
+    )
+    try:
+        for logger_name in noisy_loggers:
+            logging.getLogger(logger_name).setLevel(logging.WARNING)
+        os.environ['TF_CPP_MIN_LOG_LEVEL'] = "1" if debug else "3"
+        # pyannote's version check is verbose; swap it for a function that
+        # accepts the same call shape and does nothing.
+        def fix_verbose_pyannote(*args, what=""):
+            pass
+        import pyannote.audio.core.model # noqa
+        pyannote.audio.core.model.check_version = fix_verbose_pyannote
+    except Exception as error:
+        logger.error(str(error))
+def setup_logger(name_log):
+    """Return the logger called ``name_log``, configured for console output.
+    The logger is set to INFO, stops propagating to the root logger and
+    formats records as ``[LEVEL] >> message``.
+    Args:
+        name_log: Name handed to ``logging.getLogger``.
+    Returns:
+        logging.Logger: The configured logger.
+    """
+    logger = logging.getLogger(name_log)
+    logger.setLevel(logging.INFO)
+    # logging.getLogger returns the same object for the same name, so a
+    # handler is attached only when none exists yet. Adding one on every
+    # call would print each record once per call made so far.
+    if not logger.handlers:
+        _default_handler = logging.StreamHandler()  # Set sys.stderr as stream.
+        _default_handler.flush = sys.stderr.flush
+        logger.addHandler(_default_handler)
+    logger.propagate = False
+    # One formatter instance is enough for all handlers.
+    formatter = logging.Formatter("[%(levelname)s] >> %(message)s")
+    for handler in logger.handlers:
+        handler.setFormatter(formatter)
+    return logger
+# Package-wide logger instance; created once at import time.
+logger = setup_logger("sonitranslate")
+# setup_logger() has already set INFO; this keeps the default level explicit.
+logger.setLevel(logging.INFO)
+def set_logging_level(verbosity_level):
+    """Set the level of the module-level ``logger`` from a level name.
+    Args:
+        verbosity_level: One of "debug", "info", "warning", "error" or
+            "critical". Any other value falls back to INFO.
+    """
+    known_levels = dict(
+        debug=logging.DEBUG,
+        info=logging.INFO,
+        warning=logging.WARNING,
+        error=logging.ERROR,
+        critical=logging.CRITICAL,
+    )
+    chosen_level = known_levels.get(verbosity_level, logging.INFO)
+    logger.setLevel(chosen_level)

soni_translate/mdx_net.py CHANGED Viewed

@@ -1,594 +1,582 @@
-import gc
-import hashlib
-import os
-import queue
-import threading
-import json
-import shlex
-import sys
-import subprocess
-import librosa
-import numpy as np
-import soundfile as sf
-import torch
-from tqdm import tqdm
-try:
-    from .utils import (
-        remove_directory_contents,
-        create_directories,
-    )
-except:  # noqa
-    from utils import (
-        remove_directory_contents,
-        create_directories,
-    )
-from .logging_setup import logger
-try:
-    import onnxruntime as ort
-except Exception as error:
-    logger.error(str(error))
-# import warnings
-# warnings.filterwarnings("ignore")
-stem_naming = {
-    "Vocals": "Instrumental",
-    "Other": "Instruments",
-    "Instrumental": "Vocals",
-    "Drums": "Drumless",
-    "Bass": "Bassless",
-}
-class MDXModel:
-    def __init__(
-        self,
-        device,
-        dim_f,
-        dim_t,
-        n_fft,
-        hop=1024,
-        stem_name=None,
-        compensation=1.000,
-    ):
-        self.dim_f = dim_f
-        self.dim_t = dim_t
-        self.dim_c = 4
-        self.n_fft = n_fft
-        self.hop = hop
-        self.stem_name = stem_name
-        self.compensation = compensation
-        self.n_bins = self.n_fft // 2 + 1
-        self.chunk_size = hop * (self.dim_t - 1)
-        self.window = torch.hann_window(
-            window_length=self.n_fft, periodic=True
-        ).to(device)
-        out_c = self.dim_c
-        self.freq_pad = torch.zeros(
-            [1, out_c, self.n_bins - self.dim_f, self.dim_t]
-        ).to(device)
-    def stft(self, x):
-        x = x.reshape([-1, self.chunk_size])
-        x = torch.stft(
-            x,
-            n_fft=self.n_fft,
-            hop_length=self.hop,
-            window=self.window,
-            center=True,
-            return_complex=True,
-        )
-        x = torch.view_as_real(x)
-        x = x.permute([0, 3, 1, 2])
-        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
-            [-1, 4, self.n_bins, self.dim_t]
-        )
-        return x[:, :, : self.dim_f]
-    def istft(self, x, freq_pad=None):
-        freq_pad = (
-            self.freq_pad.repeat([x.shape[0], 1, 1, 1])
-            if freq_pad is None
-            else freq_pad
-        )
-        x = torch.cat([x, freq_pad], -2)
-        # c = 4*2 if self.target_name=='*' else 2
-        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
-            [-1, 2, self.n_bins, self.dim_t]
-        )
-        x = x.permute([0, 2, 3, 1])
-        x = x.contiguous()
-        x = torch.view_as_complex(x)
-        x = torch.istft(
-            x,
-            n_fft=self.n_fft,
-            hop_length=self.hop,
-            window=self.window,
-            center=True,
-        )
-        return x.reshape([-1, 2, self.chunk_size])
-class MDX:
-    DEFAULT_SR = 44100
-    # Unit: seconds
-    DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
-    DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR
-    def __init__(
-        self, model_path: str, params: MDXModel, processor=0
-    ):
-        # Set the device and the provider (CPU or CUDA)
-        self.device = (
-            torch.device(f"cuda:{processor}")
-            if processor >= 0
-            else torch.device("cpu")
-        )
-        self.provider = (
-            ["CUDAExecutionProvider"]
-            if processor >= 0
-            else ["CPUExecutionProvider"]
-        )
-        self.model = params
-        # Load the ONNX model using ONNX Runtime
-        self.ort = ort.InferenceSession(model_path, providers=self.provider)
-        # Preload the model for faster performance
-        self.ort.run(
-            None,
-            {"input": torch.rand(1, 4, params.dim_f, params.dim_t).numpy()},
-        )
-        self.process = lambda spec: self.ort.run(
-            None, {"input": spec.cpu().numpy()}
-        )[0]
-        self.prog = None
-    @staticmethod
-    def get_hash(model_path):
-        try:
-            with open(model_path, "rb") as f:
-                f.seek(-10000 * 1024, 2)
-                model_hash = hashlib.md5(f.read()).hexdigest()
-        except: # noqa
-            model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest()
-        return model_hash
-    @staticmethod
-    def segment(
-        wave,
-        combine=True,
-        chunk_size=DEFAULT_CHUNK_SIZE,
-        margin_size=DEFAULT_MARGIN_SIZE,
-    ):
-        """
-        Segment or join segmented wave array
-        Args:
-            wave: (np.array) Wave array to be segmented or joined
-            combine: (bool) If True, combines segmented wave array.
-                If False, segments wave array.
-            chunk_size: (int) Size of each segment (in samples)
-            margin_size: (int) Size of margin between segments (in samples)
-        Returns:
-            numpy array: Segmented or joined wave array
-        """
-        if combine:
-            # Initializing as None instead of [] for later numpy array concatenation
-            processed_wave = None
-            for segment_count, segment in enumerate(wave):
-                start = 0 if segment_count == 0 else margin_size
-                end = None if segment_count == len(wave) - 1 else -margin_size
-                if margin_size == 0:
-                    end = None
-                if processed_wave is None:  # Create array for first segment
-                    processed_wave = segment[:, start:end]
-                else:  # Concatenate to existing array for subsequent segments
-                    processed_wave = np.concatenate(
-                        (processed_wave, segment[:, start:end]), axis=-1
-                    )
-        else:
-            processed_wave = []
-            sample_count = wave.shape[-1]
-            if chunk_size <= 0 or chunk_size > sample_count:
-                chunk_size = sample_count
-            if margin_size > chunk_size:
-                margin_size = chunk_size
-            for segment_count, skip in enumerate(
-                range(0, sample_count, chunk_size)
-            ):
-                margin = 0 if segment_count == 0 else margin_size
-                end = min(skip + chunk_size + margin_size, sample_count)
-                start = skip - margin
-                cut = wave[:, start:end].copy()
-                processed_wave.append(cut)
-                if end == sample_count:
-                    break
-        return processed_wave
-    def pad_wave(self, wave):
-        """
-        Pad the wave array to match the required chunk size
-        Args:
-            wave: (np.array) Wave array to be padded
-        Returns:
-            tuple: (padded_wave, pad, trim)
-                - padded_wave: Padded wave array
-                - pad: Number of samples that were padded
-                - trim: Number of samples that were trimmed
-        """
-        n_sample = wave.shape[1]
-        trim = self.model.n_fft // 2
-        gen_size = self.model.chunk_size - 2 * trim
-        pad = gen_size - n_sample % gen_size
-        # Padded wave
-        wave_p = np.concatenate(
-            (
-                np.zeros((2, trim)),
-                wave,
-                np.zeros((2, pad)),
-                np.zeros((2, trim)),
-            ),
-            1,
-        )
-        mix_waves = []
-        for i in range(0, n_sample + pad, gen_size):
-            waves = np.array(wave_p[:, i:i + self.model.chunk_size])
-            mix_waves.append(waves)
-        mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(
-            self.device
-        )
-        return mix_waves, pad, trim
-    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
-        """
-        Process each wave segment in a multi-threaded environment
-        Args:
-            mix_waves: (torch.Tensor) Wave segments to be processed
-            trim: (int) Number of samples trimmed during padding
-            pad: (int) Number of samples padded during padding
-            q: (queue.Queue) Queue to hold the processed wave segments
-            _id: (int) Identifier of the processed wave segment
-        Returns:
-            numpy array: Processed wave segment
-        """
-        mix_waves = mix_waves.split(1)
-        with torch.no_grad():
-            pw = []
-            for mix_wave in mix_waves:
-                self.prog.update()
-                spec = self.model.stft(mix_wave)
-                processed_spec = torch.tensor(self.process(spec))
-                processed_wav = self.model.istft(
-                    processed_spec.to(self.device)
-                )
-                processed_wav = (
-                    processed_wav[:, :, trim:-trim]
-                    .transpose(0, 1)
-                    .reshape(2, -1)
-                    .cpu()
-                    .numpy()
-                )
-                pw.append(processed_wav)
-        processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]
-        q.put({_id: processed_signal})
-        return processed_signal
-    def process_wave(self, wave: np.array, mt_threads=1):
-        """
-        Process the wave array in a multi-threaded environment
-        Args:
-            wave: (np.array) Wave array to be processed
-            mt_threads: (int) Number of threads to be used for processing
-        Returns:
-            numpy array: Processed wave array
-        """
-        self.prog = tqdm(total=0)
-        chunk = wave.shape[-1] // mt_threads
-        waves = self.segment(wave, False, chunk)
-        # Create a queue to hold the processed wave segments
-        q = queue.Queue()
-        threads = []
-        for c, batch in enumerate(waves):
-            mix_waves, pad, trim = self.pad_wave(batch)
-            self.prog.total = len(mix_waves) * mt_threads
-            thread = threading.Thread(
-                target=self._process_wave, args=(mix_waves, trim, pad, q, c)
-            )
-            thread.start()
-            threads.append(thread)
-        for thread in threads:
-            thread.join()
-        self.prog.close()
-        processed_batches = []
-        while not q.empty():
-            processed_batches.append(q.get())
-        processed_batches = [
-            list(wave.values())[0]
-            for wave in sorted(
-                processed_batches, key=lambda d: list(d.keys())[0]
-            )
-        ]
-        assert len(processed_batches) == len(
-            waves
-        ), "Incomplete processed batches, please reduce batch size!"
-        return self.segment(processed_batches, True, chunk)
-def run_mdx(
-    model_params,
-    output_dir,
-    model_path,
-    filename,
-    exclude_main=False,
-    exclude_inversion=False,
-    suffix=None,
-    invert_suffix=None,
-    denoise=False,
-    keep_orig=True,
-    m_threads=2,
-    device_base="cuda",
-):
-    if device_base == "cuda":
-        device = torch.device("cuda:0")
-        processor_num = 0
-        device_properties = torch.cuda.get_device_properties(device)
-        vram_gb = device_properties.total_memory / 1024**3
-        m_threads = 1 if vram_gb < 8 else 2
-    else:
-        device = torch.device("cpu")
-        processor_num = -1
-        m_threads = 1
-    if os.environ.get("ZERO_GPU") == "TRUE":
-        duration = librosa.get_duration(filename=filename)
-        if duration < 60:
-            pass
-        elif duration >= 60 and duration <= 900:
-            m_threads = 4
-        elif duration > 900:
-            m_threads = 16
-    logger.info(f"MDX-NET Threads: {m_threads}, duration {duration}")
-    model_hash = MDX.get_hash(model_path)
-    mp = model_params.get(model_hash)
-    model = MDXModel(
-        device,
-        dim_f=mp["mdx_dim_f_set"],
-        dim_t=2 ** mp["mdx_dim_t_set"],
-        n_fft=mp["mdx_n_fft_scale_set"],
-        stem_name=mp["primary_stem"],
-        compensation=mp["compensate"],
-    )
-    mdx_sess = MDX(model_path, model, processor=processor_num)
-    wave, sr = librosa.load(filename, mono=False, sr=44100)
-    # normalizing input wave gives better output
-    peak = max(np.max(wave), abs(np.min(wave)))
-    wave /= peak
-    if denoise:
-        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (
-            mdx_sess.process_wave(wave, m_threads)
-        )
-        wave_processed *= 0.5
-    else:
-        wave_processed = mdx_sess.process_wave(wave, m_threads)
-    # return to previous peak
-    wave_processed *= peak
-    stem_name = model.stem_name if suffix is None else suffix
-    main_filepath = None
-    if not exclude_main:
-        main_filepath = os.path.join(
-            output_dir,
-            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
-        )
-        sf.write(main_filepath, wave_processed.T, sr)
-    invert_filepath = None
-    if not exclude_inversion:
-        diff_stem_name = (
-            stem_naming.get(stem_name)
-            if invert_suffix is None
-            else invert_suffix
-        )
-        stem_name = (
-            f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
-        )
-        invert_filepath = os.path.join(
-            output_dir,
-            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
-        )
-        sf.write(
-            invert_filepath,
-            (-wave_processed.T * model.compensation) + wave.T,
-            sr,
-        )
-    if not keep_orig:
-        os.remove(filename)
-    del mdx_sess, wave_processed, wave
-    gc.collect()
-    torch.cuda.empty_cache()
-    return main_filepath, invert_filepath
-MDX_DOWNLOAD_LINK = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/"
-UVR_MODELS = [
-    "UVR-MDX-NET-Voc_FT.onnx",
-    "UVR_MDXNET_KARA_2.onnx",
-    "Reverb_HQ_By_FoxJoy.onnx",
-    "UVR-MDX-NET-Inst_HQ_4.onnx",
-]
-BASE_DIR = "."  # os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-mdxnet_models_dir = os.path.join(BASE_DIR, "mdx_models")
-output_dir = os.path.join(BASE_DIR, "clean_song_output")
-def convert_to_stereo_and_wav(audio_path):
-    wave, sr = librosa.load(audio_path, mono=False, sr=44100)
-    # check if mono
-    if type(wave[0]) != np.ndarray or audio_path[-4:].lower() != ".wav": # noqa
-        stereo_path = f"{os.path.splitext(audio_path)[0]}_stereo.wav"
-        stereo_path = os.path.join(output_dir, stereo_path)
-        command = shlex.split(
-            f'ffmpeg -y -loglevel error -i "{audio_path}" -ac 2 -f wav "{stereo_path}"'
-        )
-        sub_params = {
-            "stdout": subprocess.PIPE,
-            "stderr": subprocess.PIPE,
-            "creationflags": subprocess.CREATE_NO_WINDOW
-            if sys.platform == "win32"
-            else 0,
-        }
-        process_wav = subprocess.Popen(command, **sub_params)
-        output, errors = process_wav.communicate()
-        if process_wav.returncode != 0 or not os.path.exists(stereo_path):
-            raise Exception("Error processing audio to stereo wav")
-        return stereo_path
-    else:
-        return audio_path
-def process_uvr_task(
-    orig_song_path: str = "aud_test.mp3",
-    main_vocals: bool = False,
-    dereverb: bool = True,
-    song_id: str = "mdx",  # folder output name
-    only_voiceless: bool = False,
-    remove_files_output_dir: bool = False,
-):
-    if os.environ.get("SONITR_DEVICE") == "cpu":
-        device_base = "cpu"
-    else:
-        device_base = "cuda" if torch.cuda.is_available() else "cpu"
-    if remove_files_output_dir:
-        remove_directory_contents(output_dir)
-    with open(os.path.join(mdxnet_models_dir, "data.json")) as infile:
-        mdx_model_params = json.load(infile)
-    song_output_dir = os.path.join(output_dir, song_id)
-    create_directories(song_output_dir)
-    orig_song_path = convert_to_stereo_and_wav(orig_song_path)
-    logger.debug(f"onnxruntime device >> {ort.get_device()}")
-    if only_voiceless:
-        logger.info("Voiceless Track Separation...")
-        return run_mdx(
-            mdx_model_params,
-            song_output_dir,
-            os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Inst_HQ_4.onnx"),
-            orig_song_path,
-            suffix="Voiceless",
-            denoise=False,
-            keep_orig=True,
-            exclude_inversion=True,
-            device_base=device_base,
-        )
-    logger.info("Vocal Track Isolation and Voiceless Track Separation...")
-    vocals_path, instrumentals_path = run_mdx(
-        mdx_model_params,
-        song_output_dir,
-        os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Voc_FT.onnx"),
-        orig_song_path,
-        denoise=True,
-        keep_orig=True,
-        device_base=device_base,
-    )
-    if main_vocals:
-        logger.info("Main Voice Separation from Supporting Vocals...")
-        backup_vocals_path, main_vocals_path = run_mdx(
-            mdx_model_params,
-            song_output_dir,
-            os.path.join(mdxnet_models_dir, "UVR_MDXNET_KARA_2.onnx"),
-            vocals_path,
-            suffix="Backup",
-            invert_suffix="Main",
-            denoise=True,
-            device_base=device_base,
-        )
-    else:
-        backup_vocals_path, main_vocals_path = None, vocals_path
-    if dereverb:
-        logger.info("Vocal Clarity Enhancement through De-Reverberation...")
-        _, vocals_dereverb_path = run_mdx(
-            mdx_model_params,
-            song_output_dir,
-            os.path.join(mdxnet_models_dir, "Reverb_HQ_By_FoxJoy.onnx"),
-            main_vocals_path,
-            invert_suffix="DeReverb",
-            exclude_main=True,
-            denoise=True,
-            device_base=device_base,
-        )
-    else:
-        vocals_dereverb_path = main_vocals_path
-    return (
-        vocals_path,
-        instrumentals_path,
-        backup_vocals_path,
-        main_vocals_path,
-        vocals_dereverb_path,
-    )
-if __name__ == "__main__":
-    from utils import download_manager
-    for id_model in UVR_MODELS:
-        download_manager(
-            os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir
-        )
-    (
-        vocals_path_,
-        instrumentals_path_,
-        backup_vocals_path_,
-        main_vocals_path_,
-        vocals_dereverb_path_,
-    ) = process_uvr_task(
-        orig_song_path="aud.mp3",
-        main_vocals=True,
-        dereverb=True,
-        song_id="mdx",
-        remove_files_output_dir=True,
-    )

+import gc
+import hashlib
+import os
+import queue
+import threading
+import json
+import shlex
+import sys
+import subprocess
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+from tqdm import tqdm
+try:
+    from .utils import (
+        remove_directory_contents,
+        create_directories,
+    )
+except:  # noqa
+    from utils import (
+        remove_directory_contents,
+        create_directories,
+    )
+from .logging_setup import logger
+try:
+    import onnxruntime as ort
+except Exception as error:
+    logger.error(str(error))
+# import warnings
+# warnings.filterwarnings("ignore")
+# Pairs a stem name with the name of its counterpart track
+# (presumably the track that remains once that stem is removed - confirm
+# against the code that looks this mapping up).
+stem_naming = {
+    "Vocals": "Instrumental",
+    "Other": "Instruments",
+    "Instrumental": "Vocals",
+    "Drums": "Drumless",
+    "Bass": "Bassless",
+}
+class MDXModel:
+    """STFT/ISTFT parameters and helpers shared by one MDX-Net model.
+    Holds the spectrogram geometry (``dim_f``, ``dim_t``, ``n_fft``,
+    ``hop``), the derived ``n_bins`` and ``chunk_size``, the analysis
+    window and a zero block used to restore the frequency bins that
+    ``stft`` drops.
+    """
+    def __init__(
+        self,
+        device,
+        dim_f,
+        dim_t,
+        n_fft,
+        hop=1024,
+        stem_name=None,
+        compensation=1.000,
+    ):
+        """Store the parameters and pre-build the window and padding tensors.
+        Args:
+            device: Torch device the window and padding tensors are moved to.
+            dim_f: Number of frequency bins kept by ``stft``.
+            dim_t: Number of time frames per chunk.
+            n_fft: FFT size; also the window length.
+            hop: Hop length between frames.
+            stem_name: Stored as-is on the instance.
+            compensation: Stored as-is on the instance.
+        """
+        self.dim_f = dim_f
+        self.dim_t = dim_t
+        # Channel dimension of the spectrogram tensors; fixed at 4 here.
+        self.dim_c = 4
+        self.n_fft = n_fft
+        self.hop = hop
+        self.stem_name = stem_name
+        self.compensation = compensation
+        self.n_bins = self.n_fft // 2 + 1
+        # Number of audio samples consumed per chunk.
+        self.chunk_size = hop * (self.dim_t - 1)
+        self.window = torch.hann_window(
+            window_length=self.n_fft, periodic=True
+        ).to(device)
+        out_c = self.dim_c
+        # Zeros for the n_bins - dim_f bins that stft() cuts off; istft()
+        # appends them again before inverting.
+        self.freq_pad = torch.zeros(
+            [1, out_c, self.n_bins - self.dim_f, self.dim_t]
+        ).to(device)
+    def stft(self, x):
+        """Turn waveform chunks into a real-valued spectrogram tensor.
+        The input is flattened to rows of ``chunk_size`` samples, transformed
+        with ``torch.stft``, split into real and imaginary parts and
+        reshaped to ``[-1, 4, n_bins, dim_t]`` (every two consecutive rows
+        are grouped - presumably the two stereo channels, confirm with the
+        caller).
+        Returns:
+            torch.Tensor: The spectrogram restricted to the first ``dim_f``
+            frequency bins.
+        """
+        x = x.reshape([-1, self.chunk_size])
+        x = torch.stft(
+            x,
+            n_fft=self.n_fft,
+            hop_length=self.hop,
+            window=self.window,
+            center=True,
+            return_complex=True,
+        )
+        x = torch.view_as_real(x)
+        # Move the real/imaginary axis in front of the bin and frame axes.
+        x = x.permute([0, 3, 1, 2])
+        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
+            [-1, 4, self.n_bins, self.dim_t]
+        )
+        return x[:, :, : self.dim_f]
+    def istft(self, x, freq_pad=None):
+        """Invert a spectrogram produced by ``stft`` back to waveform chunks.
+        Args:
+            x: Spectrogram tensor with ``dim_f`` frequency bins.
+            freq_pad: Optional tensor concatenated along the frequency axis;
+                when omitted, ``self.freq_pad`` is repeated once per item
+                of ``x``.
+        Returns:
+            torch.Tensor: Waveforms reshaped to ``[-1, 2, chunk_size]``.
+        """
+        freq_pad = (
+            self.freq_pad.repeat([x.shape[0], 1, 1, 1])
+            if freq_pad is None
+            else freq_pad
+        )
+        # Restore the full n_bins frequency axis.
+        x = torch.cat([x, freq_pad], -2)
+        # c = 4*2 if self.target_name=='*' else 2
+        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
+            [-1, 2, self.n_bins, self.dim_t]
+        )
+        # view_as_complex needs the size-2 (real, imaginary) axis last and
+        # a contiguous tensor.
+        x = x.permute([0, 2, 3, 1])
+        x = x.contiguous()
+        x = torch.view_as_complex(x)
+        x = torch.istft(
+            x,
+            n_fft=self.n_fft,
+            hop_length=self.hop,
+            window=self.window,
+            center=True,
+        )
+        return x.reshape([-1, 2, self.chunk_size])
+class MDX:
+    """Chunked inference wrapper around an MDX-Net ONNX model.
+    A wave is cut into overlapping segments (``segment``), each segment is
+    padded and sliced into model-sized chunks (``pad_wave``), the chunks are
+    run through the ONNX session on worker threads (``_process_wave``) and
+    the results are joined again (``process_wave``).
+    """
+    DEFAULT_SR = 44100
+    # Both sizes are sample counts, written as seconds * DEFAULT_SR.
+    # A chunk size of 0 makes segment() treat the whole wave as one chunk.
+    DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
+    DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR
+    def __init__(
+        self, model_path: str, params: MDXModel, processor=0
+    ):
+        """Create the ONNX session and warm it up.
+        Args:
+            model_path: Path of the ``.onnx`` model file.
+            params: STFT parameters matching the model.
+            processor: CUDA device index; a negative value selects the CPU.
+        """
+        # Set the device and the provider (CPU or CUDA)
+        self.device = (
+            torch.device(f"cuda:{processor}")
+            if processor >= 0
+            else torch.device("cpu")
+        )
+        self.provider = (
+            ["CUDAExecutionProvider"]
+            if processor >= 0
+            else ["CPUExecutionProvider"]
+        )
+        self.model = params
+        # Load the ONNX model using ONNX Runtime
+        self.ort = ort.InferenceSession(model_path, providers=self.provider)
+        # Preload the model for faster performance
+        self.ort.run(
+            None,
+            {"input": torch.rand(1, 4, params.dim_f, params.dim_t).numpy()},
+        )
+        # Runs one spectrogram through the session and returns its first
+        # output.
+        self.process = lambda spec: self.ort.run(
+            None, {"input": spec.cpu().numpy()}
+        )[0]
+        # Progress bar; created by process_wave().
+        self.prog = None
+    @staticmethod
+    def get_hash(model_path):
+        """Return the MD5 hex digest identifying a model file.
+        Only the last 10000 KiB of the file are hashed; a file shorter than
+        that is hashed completely.
+        Args:
+            model_path: Path of the model file.
+        Returns:
+            str: MD5 hex digest.
+        """
+        with open(model_path, "rb") as f:
+            try:
+                f.seek(-10000 * 1024, 2)
+            except OSError:
+                # The file is shorter than the tail window: hash all of it.
+                f.seek(0)
+            return hashlib.md5(f.read()).hexdigest()
+    @staticmethod
+    def segment(
+        wave,
+        combine=True,
+        chunk_size=DEFAULT_CHUNK_SIZE,
+        margin_size=DEFAULT_MARGIN_SIZE,
+    ):
+        """
+        Segment or join segmented wave array
+        Args:
+            wave: (np.array) Wave array to be segmented or joined
+            combine: (bool) If True, combines segmented wave array.
+                If False, segments wave array.
+            chunk_size: (int) Size of each segment (in samples). A value
+                <= 0 or larger than the wave means "the whole wave".
+            margin_size: (int) Size of margin between segments (in samples).
+                When segmenting it is clamped to chunk_size; when combining
+                the caller must pass the value that was effectively used.
+        Returns:
+            When combine is False, a list of numpy arrays (the segments);
+            when combine is True, the joined numpy array (None if wave is
+            empty)
+        """
+        if combine:
+            # Initializing as None instead of [] for later numpy array concatenation
+            processed_wave = None
+            for segment_count, segment in enumerate(wave):
+                # Drop the leading margin of every segment but the first and
+                # the trailing margin of every segment but the last.
+                start = 0 if segment_count == 0 else margin_size
+                end = None if segment_count == len(wave) - 1 else -margin_size
+                if margin_size == 0:
+                    end = None
+                if processed_wave is None:  # Create array for first segment
+                    processed_wave = segment[:, start:end]
+                else:  # Concatenate to existing array for subsequent segments
+                    processed_wave = np.concatenate(
+                        (processed_wave, segment[:, start:end]), axis=-1
+                    )
+        else:
+            processed_wave = []
+            sample_count = wave.shape[-1]
+            if chunk_size <= 0 or chunk_size > sample_count:
+                chunk_size = sample_count
+            if margin_size > chunk_size:
+                margin_size = chunk_size
+            for segment_count, skip in enumerate(
+                range(0, sample_count, chunk_size)
+            ):
+                margin = 0 if segment_count == 0 else margin_size
+                end = min(skip + chunk_size + margin_size, sample_count)
+                start = skip - margin
+                cut = wave[:, start:end].copy()
+                processed_wave.append(cut)
+                if end == sample_count:
+                    break
+        return processed_wave
+    def pad_wave(self, wave):
+        """
+        Pad the wave array and slice it into model-sized chunks
+        Args:
+            wave: (np.array) Two-channel wave array to be padded
+        Returns:
+            tuple: (mix_waves, pad, trim)
+                - mix_waves: float32 tensor on self.device holding the
+                  stacked chunks, each self.model.chunk_size samples long
+                - pad: Number of zero samples appended after the wave
+                - trim: Number of zero samples added at each end
+                  (n_fft // 2)
+        """
+        n_sample = wave.shape[1]
+        trim = self.model.n_fft // 2
+        # Each chunk contributes gen_size new samples; the remaining
+        # 2 * trim samples overlap with its neighbours.
+        gen_size = self.model.chunk_size - 2 * trim
+        pad = gen_size - n_sample % gen_size
+        # Padded wave
+        wave_p = np.concatenate(
+            (
+                np.zeros((2, trim)),
+                wave,
+                np.zeros((2, pad)),
+                np.zeros((2, trim)),
+            ),
+            1,
+        )
+        mix_waves = [
+            wave_p[:, i:i + self.model.chunk_size]
+            for i in range(0, n_sample + pad, gen_size)
+        ]
+        # Stack into one ndarray first: building a tensor straight from a
+        # list of ndarrays is much slower and gives the same values.
+        mix_waves = torch.tensor(
+            np.array(mix_waves), dtype=torch.float32
+        ).to(self.device)
+        return mix_waves, pad, trim
+    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
+        """
+        Process each wave segment in a multi-threaded environment
+        Args:
+            mix_waves: (torch.Tensor) Wave segments to be processed
+            trim: (int) Number of samples added at each end during padding
+            pad: (int) Number of samples appended during padding
+            q: (queue.Queue) Queue to hold the processed wave segments
+            _id: (int) Identifier of the processed wave segment
+        Returns:
+            numpy array: Processed wave segment (also put on q as
+            {_id: processed_signal})
+        """
+        mix_waves = mix_waves.split(1)
+        with torch.no_grad():
+            pw = []
+            for mix_wave in mix_waves:
+                self.prog.update()
+                spec = self.model.stft(mix_wave)
+                processed_spec = torch.tensor(self.process(spec))
+                processed_wav = self.model.istft(
+                    processed_spec.to(self.device)
+                )
+                # Cut the overlap added by pad_wave() from both ends.
+                processed_wav = (
+                    processed_wav[:, :, trim:-trim]
+                    .transpose(0, 1)
+                    .reshape(2, -1)
+                    .cpu()
+                    .numpy()
+                )
+                pw.append(processed_wav)
+        # Remove the zero padding appended after the original samples.
+        processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]
+        q.put({_id: processed_signal})
+        return processed_signal
+    def process_wave(self, wave: np.array, mt_threads=1):
+        """
+        Process the wave array in a multi-threaded environment
+        Args:
+            wave: (np.array) Wave array to be processed
+            mt_threads: (int) Number of threads to be used for processing
+        Returns:
+            numpy array: Processed wave array
+        """
+        self.prog = tqdm(total=0)
+        sample_count = wave.shape[-1]
+        chunk = sample_count // mt_threads
+        # segment() clamps the chunk to the wave and the margin to the
+        # chunk when splitting. The same margin has to be used when joining,
+        # otherwise short waves are trimmed by the full default margin and
+        # come back with the wrong length.
+        effective_chunk = chunk if 0 < chunk <= sample_count else sample_count
+        margin = min(self.DEFAULT_MARGIN_SIZE, effective_chunk)
+        waves = self.segment(wave, False, chunk, margin)
+        # Create a queue to hold the processed wave segments
+        q = queue.Queue()
+        threads = []
+        for c, batch in enumerate(waves):
+            mix_waves, pad, trim = self.pad_wave(batch)
+            # Estimate for the progress bar; overwritten for every batch.
+            self.prog.total = len(mix_waves) * mt_threads
+            thread = threading.Thread(
+                target=self._process_wave, args=(mix_waves, trim, pad, q, c)
+            )
+            thread.start()
+            threads.append(thread)
+        for thread in threads:
+            thread.join()
+        self.prog.close()
+        processed_batches = []
+        while not q.empty():
+            processed_batches.append(q.get())
+        # Threads finish in any order; restore the segment order by id.
+        processed_batches = [
+            list(wave.values())[0]
+            for wave in sorted(
+                processed_batches, key=lambda d: list(d.keys())[0]
+            )
+        ]
+        # A worker that raised never put its result on the queue.
+        assert len(processed_batches) == len(
+            waves
+        ), "Incomplete processed batches, please reduce batch size!"
+        return self.segment(processed_batches, True, chunk, margin)
+def run_mdx(
+    model_params,
+    output_dir,
+    model_path,
+    filename,
+    exclude_main=False,
+    exclude_inversion=False,
+    suffix=None,
+    invert_suffix=None,
+    denoise=False,
+    keep_orig=True,
+    m_threads=2,
+    device_base="cuda",
+):
+    """
+    Separate one audio file with an MDX model and write the stems as wav
+    Args:
+        model_params: Mapping from model hash to a parameter dict with the
+            keys "mdx_dim_f_set", "mdx_dim_t_set", "mdx_n_fft_scale_set",
+            "primary_stem" and "compensate"
+        output_dir: Directory the stem files are written to
+        model_path: Path of the model file; its hash selects the parameters
+        filename: Audio file to separate (loaded at 44100 Hz, not downmixed)
+        exclude_main: Skip writing the primary stem
+        exclude_inversion: Skip writing the inverted (mix minus stem) file
+        suffix: Name used for the primary stem file instead of the model's
+            stem name
+        invert_suffix: Name used for the inverted stem file instead of the
+            ``stem_naming`` lookup
+        denoise: Average the result of the wave and of its negation
+        keep_orig: When False, ``filename`` is deleted after processing
+        m_threads: Kept for compatibility; the value is always replaced
+            below (1 on CPU, 1 or 2 on CUDA depending on total VRAM)
+        device_base: "cuda" selects ``cuda:0``; anything else selects CPU
+    Returns:
+        tuple: (main_filepath, invert_filepath); an entry is None when the
+        corresponding file was excluded
+    Raises:
+        ValueError: If ``model_params`` has no entry for the model's hash
+    """
+    if device_base == "cuda":
+        device = torch.device("cuda:0")
+        processor_num = 0
+        device_properties = torch.cuda.get_device_properties(device)
+        vram_gb = device_properties.total_memory / 1024**3
+        m_threads = 1 if vram_gb < 8 else 2
+    else:
+        device = torch.device("cpu")
+        processor_num = -1
+        m_threads = 1
+    model_hash = MDX.get_hash(model_path)
+    mp = model_params.get(model_hash)
+    if mp is None:
+        # Fail with a readable message instead of a TypeError on mp[...]
+        raise ValueError(
+            f"No MDX parameters found for model '{model_path}' "
+            f"(hash: {model_hash})"
+        )
+    model = MDXModel(
+        device,
+        dim_f=mp["mdx_dim_f_set"],
+        dim_t=2 ** mp["mdx_dim_t_set"],
+        n_fft=mp["mdx_n_fft_scale_set"],
+        stem_name=mp["primary_stem"],
+        compensation=mp["compensate"],
+    )
+    mdx_sess = MDX(model_path, model, processor=processor_num)
+    wave, sr = librosa.load(filename, mono=False, sr=44100)
+    # normalizing input wave gives better output
+    peak = max(np.max(wave), abs(np.min(wave)))
+    if peak > 0:
+        wave /= peak
+    else:
+        # Silent input: dividing by zero would fill the wave with NaNs, so
+        # skip the normalization and make the later rescaling a no-op.
+        peak = 1.0
+    if denoise:
+        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (
+            mdx_sess.process_wave(wave, m_threads)
+        )
+        wave_processed *= 0.5
+    else:
+        wave_processed = mdx_sess.process_wave(wave, m_threads)
+    # return to previous peak
+    wave_processed *= peak
+    base_name = os.path.basename(os.path.splitext(filename)[0])
+    stem_name = model.stem_name if suffix is None else suffix
+    main_filepath = None
+    if not exclude_main:
+        main_filepath = os.path.join(
+            output_dir,
+            f"{base_name}_{stem_name}.wav",
+        )
+        sf.write(main_filepath, wave_processed.T, sr)
+    invert_filepath = None
+    if not exclude_inversion:
+        diff_stem_name = (
+            stem_naming.get(stem_name)
+            if invert_suffix is None
+            else invert_suffix
+        )
+        invert_stem_name = (
+            f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
+        )
+        invert_filepath = os.path.join(
+            output_dir,
+            f"{base_name}_{invert_stem_name}.wav",
+        )
+        # `wave` was normalized in place while `wave_processed` is already
+        # back at the original level; rescale the mix so both operands of
+        # the subtraction share the same scale.
+        sf.write(
+            invert_filepath,
+            (-wave_processed.T * model.compensation) + wave.T * peak,
+            sr,
+        )
+    if not keep_orig:
+        os.remove(filename)
+    del mdx_sess, wave_processed, wave
+    gc.collect()
+    torch.cuda.empty_cache()
+    return main_filepath, invert_filepath
+# Base URL the model file names below are joined onto for downloading.
+MDX_DOWNLOAD_LINK = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/"
+# Model files this module refers to by name.
+UVR_MODELS = [
+    "UVR-MDX-NET-Voc_FT.onnx",
+    "UVR_MDXNET_KARA_2.onnx",
+    "Reverb_HQ_By_FoxJoy.onnx",
+    "UVR-MDX-NET-Inst_HQ_4.onnx",
+]
+# All paths are relative to the current working directory.
+BASE_DIR = "."  # os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# Folder holding the model files and their data.json parameter table.
+mdxnet_models_dir = os.path.join(BASE_DIR, "mdx_models")
+# Root folder for separated audio.
+output_dir = os.path.join(BASE_DIR, "clean_song_output")
+def convert_to_stereo_and_wav(audio_path):
+    """
+    Return a stereo wav version of the given audio file
+    The input is returned unchanged when it is already a multi-channel
+    ".wav" file; otherwise ffmpeg writes "<name>_stereo.wav" into
+    ``output_dir`` and that path is returned.
+    Args:
+        audio_path: (str) Path of the audio file to check/convert
+    Returns:
+        str: ``audio_path`` itself or the path of the converted file
+    Raises:
+        Exception: If ffmpeg exits with an error or produces no file
+    """
+    wave, _ = librosa.load(audio_path, mono=False, sr=44100)
+    # check if mono: the first element is only an array when the loaded
+    # data has more than one dimension
+    is_multichannel = isinstance(wave[0], np.ndarray)
+    is_wav = audio_path[-4:].lower() == ".wav"
+    if is_multichannel and is_wav:
+        return audio_path
+    # Only the file name is used: joining the full input path would drop
+    # `output_dir` for absolute inputs and point into missing sub-folders
+    # for relative ones.
+    base_name = os.path.basename(os.path.splitext(audio_path)[0])
+    os.makedirs(output_dir, exist_ok=True)
+    stereo_path = os.path.join(output_dir, f"{base_name}_stereo.wav")
+    # Argument list instead of a quoted string, so file names containing
+    # quotes or other shell-special characters are passed through intact.
+    command = [
+        "ffmpeg",
+        "-y",
+        "-loglevel",
+        "error",
+        "-i",
+        audio_path,
+        "-ac",
+        "2",
+        "-f",
+        "wav",
+        stereo_path,
+    ]
+    sub_params = {
+        "stdout": subprocess.PIPE,
+        "stderr": subprocess.PIPE,
+        "creationflags": subprocess.CREATE_NO_WINDOW
+        if sys.platform == "win32"
+        else 0,
+    }
+    process_wav = subprocess.Popen(command, **sub_params)
+    process_wav.communicate()
+    if process_wav.returncode != 0 or not os.path.exists(stereo_path):
+        raise Exception("Error processing audio to stereo wav")
+    return stereo_path
+def process_uvr_task(
+    orig_song_path: str = "aud_test.mp3",
+    main_vocals: bool = False,
+    dereverb: bool = True,
+    song_id: str = "mdx",  # folder output name
+    only_voiceless: bool = False,
+    remove_files_output_dir: bool = False,
+):
+    """
+    Run the separation pipeline on one audio file
+    Args:
+        orig_song_path: Audio file to process; it is first passed through
+            ``convert_to_stereo_and_wav``
+        main_vocals: Also split the vocals into main and backup vocals
+        dereverb: Also run the de-reverberation model on the main vocals
+        song_id: Name of the sub-folder of ``output_dir`` for the results
+        only_voiceless: Only produce the voiceless track and return early
+        remove_files_output_dir: Empty ``output_dir`` before starting
+    Returns:
+        tuple: With ``only_voiceless`` the two-item result of ``run_mdx``;
+        otherwise (vocals_path, instrumentals_path, backup_vocals_path,
+        main_vocals_path, vocals_dereverb_path)
+    """
+    # The environment variable forces CPU; CUDA is only probed otherwise.
+    force_cpu = os.environ.get("SONITR_DEVICE") == "cpu"
+    device_base = (
+        "cuda" if not force_cpu and torch.cuda.is_available() else "cpu"
+    )
+    if remove_files_output_dir:
+        remove_directory_contents(output_dir)
+    params_file = os.path.join(mdxnet_models_dir, "data.json")
+    with open(params_file) as infile:
+        mdx_model_params = json.load(infile)
+    song_output_dir = os.path.join(output_dir, song_id)
+    create_directories(song_output_dir)
+    orig_song_path = convert_to_stereo_and_wav(orig_song_path)
+    logger.debug(f"onnxruntime device >> {ort.get_device()}")
+    def model_file(model_name):
+        return os.path.join(mdxnet_models_dir, model_name)
+    if only_voiceless:
+        logger.info("Voiceless Track Separation...")
+        return run_mdx(
+            mdx_model_params,
+            song_output_dir,
+            model_file("UVR-MDX-NET-Inst_HQ_4.onnx"),
+            orig_song_path,
+            suffix="Voiceless",
+            denoise=False,
+            keep_orig=True,
+            exclude_inversion=True,
+            device_base=device_base,
+        )
+    logger.info("Vocal Track Isolation and Voiceless Track Separation...")
+    vocals_path, instrumentals_path = run_mdx(
+        mdx_model_params,
+        song_output_dir,
+        model_file("UVR-MDX-NET-Voc_FT.onnx"),
+        orig_song_path,
+        denoise=True,
+        keep_orig=True,
+        device_base=device_base,
+    )
+    # Without the optional steps the plain vocals stand in for their output.
+    backup_vocals_path, main_vocals_path = None, vocals_path
+    if main_vocals:
+        logger.info("Main Voice Separation from Supporting Vocals...")
+        backup_vocals_path, main_vocals_path = run_mdx(
+            mdx_model_params,
+            song_output_dir,
+            model_file("UVR_MDXNET_KARA_2.onnx"),
+            vocals_path,
+            suffix="Backup",
+            invert_suffix="Main",
+            denoise=True,
+            device_base=device_base,
+        )
+    vocals_dereverb_path = main_vocals_path
+    if dereverb:
+        logger.info("Vocal Clarity Enhancement through De-Reverberation...")
+        _, vocals_dereverb_path = run_mdx(
+            mdx_model_params,
+            song_output_dir,
+            model_file("Reverb_HQ_By_FoxJoy.onnx"),
+            main_vocals_path,
+            invert_suffix="DeReverb",
+            exclude_main=True,
+            denoise=True,
+            device_base=device_base,
+        )
+    return (
+        vocals_path,
+        instrumentals_path,
+        backup_vocals_path,
+        main_vocals_path,
+        vocals_dereverb_path,
+    )
+if __name__ == "__main__":
+    from utils import download_manager
+    # Download every listed model file into the models folder first.
+    for id_model in UVR_MODELS:
+        model_url = os.path.join(MDX_DOWNLOAD_LINK, id_model)
+        download_manager(model_url, mdxnet_models_dir)
+    # Demo run of the full pipeline on a local file.
+    demo_result = process_uvr_task(
+        orig_song_path="aud.mp3",
+        main_vocals=True,
+        dereverb=True,
+        song_id="mdx",
+        remove_files_output_dir=True,
+    )
+    (
+        vocals_path_,
+        instrumentals_path_,
+        backup_vocals_path_,
+        main_vocals_path_,
+        vocals_dereverb_path_,
+    ) = demo_result

soni_translate/postprocessor.py CHANGED Viewed

@@ -1,231 +1,231 @@
-from .utils import remove_files, run_command
-from .text_multiformat_processor import get_subtitle
-from .logging_setup import logger
-import unicodedata
-import shutil
-import copy
-import os
-import re
-OUTPUT_TYPE_OPTIONS = [
-    "video (mp4)",
-    "video (mkv)",
-    "audio (mp3)",
-    "audio (ogg)",
-    "audio (wav)",
-    "subtitle",
-    "subtitle [by speaker]",
-    "video [subtitled] (mp4)",
-    "video [subtitled] (mkv)",
-    "audio [original vocal sound]",
-    "audio [original background sound]",
-    "audio [original vocal and background sound]",
-    "audio [original vocal-dereverb sound]",
-    "audio [original vocal-dereverb and background sound]",
-    "raw media",
-]
-DOCS_OUTPUT_TYPE_OPTIONS = [
-    "videobook (mp4)",
-    "videobook (mkv)",
-    "audiobook (wav)",
-    "audiobook (mp3)",
-    "audiobook (ogg)",
-    "book (txt)",
-]  # Add DOCX and etc.
-def get_no_ext_filename(file_path):
-    file_name_with_extension = os.path.basename(rf"{file_path}")
-    filename_without_extension, _ = os.path.splitext(file_name_with_extension)
-    return filename_without_extension
-def get_video_info(link):
-    aux_name = f"video_url_{link}"
-    params_dlp = {"quiet": True, "no_warnings": True, "noplaylist": True}
-    try:
-        from yt_dlp import YoutubeDL
-        with YoutubeDL(params_dlp) as ydl:
-            if link.startswith(("www.youtube.com/", "m.youtube.com/")):
-                link = "https://" + link
-            info_dict = ydl.extract_info(link, download=False, process=False)
-            video_id = info_dict.get("id", aux_name)
-            video_title = info_dict.get("title", video_id)
-            if "youtube.com" in link and "&list=" in link:
-                video_title = ydl.extract_info(
-                    "https://m.youtube.com/watch?v="+video_id,
-                    download=False,
-                    process=False
-                ).get("title", video_title)
-    except Exception as error:
-        logger.error(str(error))
-        video_title, video_id = aux_name, "NO_ID"
-    return video_title, video_id
-def sanitize_file_name(file_name):
-    # Normalize the string to NFKD form to separate combined
-    # characters into base characters and diacritics
-    normalized_name = unicodedata.normalize("NFKD", file_name)
-    # Replace any non-ASCII characters or special symbols with an underscore
-    sanitized_name = re.sub(r"[^\w\s.-]", "_", normalized_name)
-    return sanitized_name
-def get_output_file(
-        original_file,
-        new_file_name,
-        soft_subtitles,
-        output_directory="",
-):
-    directory_base = "."  # default directory
-    if output_directory and os.path.isdir(output_directory):
-        new_file_path = os.path.join(output_directory, new_file_name)
-    else:
-        new_file_path = os.path.join(directory_base, "outputs", new_file_name)
-    remove_files(new_file_path)
-    cm = None
-    if soft_subtitles and original_file.endswith(".mp4"):
-        if new_file_path.endswith(".mp4"):
-            cm = f'ffmpeg -y -i "{original_file}" -i sub_tra.srt -i sub_ori.srt -map 0:v -map 0:a -map 1 -map 2 -c:v copy -c:a copy -c:s mov_text "{new_file_path}"'
-        else:
-            cm = f'ffmpeg -y -i "{original_file}" -i sub_tra.srt -i sub_ori.srt -map 0:v -map 0:a -map 1 -map 2 -c:v copy -c:a copy -c:s srt -movflags use_metadata_tags -map_metadata 0 "{new_file_path}"'
-    elif new_file_path.endswith(".mkv"):
-        cm = f'ffmpeg -i "{original_file}" -c:v copy -c:a copy "{new_file_path}"'
-    elif new_file_path.endswith(".wav") and not original_file.endswith(".wav"):
-        cm = f'ffmpeg -y -i "{original_file}" -acodec pcm_s16le -ar 44100 -ac 2 "{new_file_path}"'
-    elif new_file_path.endswith(".ogg"):
-        cm = f'ffmpeg -i "{original_file}" -c:a libvorbis "{new_file_path}"'
-    elif new_file_path.endswith(".mp3") and not original_file.endswith(".mp3"):
-        cm = f'ffmpeg -y -i "{original_file}" -codec:a libmp3lame -qscale:a 2 "{new_file_path}"'
-    if cm:
-        try:
-            run_command(cm)
-        except Exception as error:
-            logger.error(str(error))
-            remove_files(new_file_path)
-            shutil.copy2(original_file, new_file_path)
-    else:
-        shutil.copy2(original_file, new_file_path)
-    return os.path.abspath(new_file_path)
-def media_out(
-    media_file,
-    lang_code,
-    media_out_name="",
-    extension="mp4",
-    file_obj="video_dub.mp4",
-    soft_subtitles=False,
-    subtitle_files="disable",
-):
-    if media_out_name:
-        base_name = media_out_name + "_origin"
-    else:
-        if os.path.exists(media_file):
-            base_name = get_no_ext_filename(media_file)
-        else:
-            base_name, _ = get_video_info(media_file)
-        media_out_name = f"{base_name}__{lang_code}"
-    f_name = f"{sanitize_file_name(media_out_name)}.{extension}"
-    if subtitle_files != "disable":
-        final_media = [get_output_file(file_obj, f_name, soft_subtitles)]
-        name_tra = f"{sanitize_file_name(media_out_name)}.{subtitle_files}"
-        name_ori = f"{sanitize_file_name(base_name)}.{subtitle_files}"
-        tgt_subs = f"sub_tra.{subtitle_files}"
-        ori_subs = f"sub_ori.{subtitle_files}"
-        final_subtitles = [
-            get_output_file(tgt_subs, name_tra, False),
-            get_output_file(ori_subs, name_ori, False)
-        ]
-        return final_media + final_subtitles
-    else:
-        return get_output_file(file_obj, f_name, soft_subtitles)
-def get_subtitle_speaker(media_file, result, language, extension, base_name):
-    segments_base = copy.deepcopy(result)
-    # Sub segments by speaker
-    segments_by_speaker = {}
-    for segment in segments_base["segments"]:
-        if segment["speaker"] not in segments_by_speaker.keys():
-            segments_by_speaker[segment["speaker"]] = [segment]
-        else:
-            segments_by_speaker[segment["speaker"]].append(segment)
-    if not base_name:
-        if os.path.exists(media_file):
-            base_name = get_no_ext_filename(media_file)
-        else:
-            base_name, _ = get_video_info(media_file)
-    files_subs = []
-    for name_sk, segments in segments_by_speaker.items():
-        subtitle_speaker = get_subtitle(
-            language,
-            {"segments": segments},
-            extension,
-            filename=name_sk,
-        )
-        media_out_name = f"{base_name}_{language}_{name_sk}"
-        output = media_out(
-            media_file,  # no need
-            language,
-            media_out_name,
-            extension,
-            file_obj=subtitle_speaker,
-        )
-        files_subs.append(output)
-    return files_subs
-def sound_separate(media_file, task_uvr):
-    from .mdx_net import process_uvr_task
-    outputs = []
-    if "vocal" in task_uvr:
-        try:
-            _, _, _, _, vocal_audio = process_uvr_task(
-                orig_song_path=media_file,
-                main_vocals=False,
-                dereverb=True if "dereverb" in task_uvr else False,
-                remove_files_output_dir=True,
-            )
-            outputs.append(vocal_audio)
-        except Exception as error:
-            logger.error(str(error))
-    if "background" in task_uvr:
-        try:
-            background_audio, _ = process_uvr_task(
-                orig_song_path=media_file,
-                song_id="voiceless",
-                only_voiceless=True,
-                remove_files_output_dir=False if "vocal" in task_uvr else True,
-            )
-            # copy_files(background_audio, ".")
-            outputs.append(background_audio)
-        except Exception as error:
-            logger.error(str(error))
-    if not outputs:
-        raise Exception("Error in uvr process")
-    return outputs

+from .utils import remove_files, run_command
+from .text_multiformat_processor import get_subtitle
+from .logging_setup import logger
+import unicodedata
+import shutil
+import copy
+import os
+import re
+# Labels of the selectable output types for media input.
+# NOTE(review): presumably shown as choices in the UI - confirm with caller.
+OUTPUT_TYPE_OPTIONS = [
+    "video (mp4)",
+    "video (mkv)",
+    "audio (mp3)",
+    "audio (ogg)",
+    "audio (wav)",
+    "subtitle",
+    "subtitle [by speaker]",
+    "video [subtitled] (mp4)",
+    "video [subtitled] (mkv)",
+    "audio [original vocal sound]",
+    "audio [original background sound]",
+    "audio [original vocal and background sound]",
+    "audio [original vocal-dereverb sound]",
+    "audio [original vocal-dereverb and background sound]",
+    "raw media",
+]
+# Labels of the selectable output types for document input.
+DOCS_OUTPUT_TYPE_OPTIONS = [
+    "videobook (mp4)",
+    "videobook (mkv)",
+    "audiobook (wav)",
+    "audiobook (mp3)",
+    "audiobook (ogg)",
+    "book (txt)",
+]  # Add DOCX and etc.
+def get_no_ext_filename(file_path):
+    """Return the last path component of *file_path* without its extension."""
+    stem, _ = os.path.splitext(os.path.basename(rf"{file_path}"))
+    return stem
+def get_video_info(link):
+    """
+    Look up title and id of an online video without downloading it.
+
+    Returns a ``(title, id)`` tuple. Any failure (including a missing
+    ``yt_dlp`` package) is logged and answered with
+    ``("video_url_<link>", "NO_ID")``.
+    """
+    fallback_name = f"video_url_{link}"
+    ydl_options = {"quiet": True, "no_warnings": True, "noplaylist": True}
+    try:
+        from yt_dlp import YoutubeDL
+        with YoutubeDL(ydl_options) as ydl:
+            # Scheme-less YouTube links get an explicit https prefix.
+            if link.startswith(("www.youtube.com/", "m.youtube.com/")):
+                link = "https://" + link
+            metadata = ydl.extract_info(link, download=False, process=False)
+            video_id = metadata.get("id", fallback_name)
+            video_title = metadata.get("title", video_id)
+            points_into_playlist = "youtube.com" in link and "&list=" in link
+            if points_into_playlist:
+                # Ask again for the single video so the title is the
+                # video's own title.
+                single_video = ydl.extract_info(
+                    "https://m.youtube.com/watch?v="+video_id,
+                    download=False,
+                    process=False
+                )
+                video_title = single_video.get("title", video_title)
+    except Exception as error:
+        logger.error(str(error))
+        video_title, video_id = fallback_name, "NO_ID"
+    return video_title, video_id
+def sanitize_file_name(file_name):
+    """
+    Make *file_name* safer to use as a file name.
+
+    The text is NFKD-normalized (composed characters are split into base
+    character and diacritic) and every character that is not a word
+    character, whitespace, "." or "-" is replaced with "_".
+    """
+    decomposed = unicodedata.normalize("NFKD", file_name)
+    return re.sub(r"[^\w\s.-]", "_", decomposed)
+def get_output_file(
+        original_file,
+        new_file_name,
+        soft_subtitles,
+        output_directory="",
+):
+    """
+    Produce the final output file and return its absolute path.
+
+    The target is ``output_directory/new_file_name`` when that directory
+    exists, otherwise ``./outputs/new_file_name``. Depending on the source
+    and target extensions (and on ``soft_subtitles``) the file is converted
+    with ffmpeg; when no conversion applies, or the conversion fails, the
+    original file is copied to the target path instead.
+    """
+    if output_directory and os.path.isdir(output_directory):
+        target_dir = output_directory
+    else:
+        target_dir = os.path.join(".", "outputs")  # default directory
+    new_file_path = os.path.join(target_dir, new_file_name)
+    remove_files(new_file_path)
+    source_is = original_file.endswith
+    target_is = new_file_path.endswith
+    cm = None
+    if soft_subtitles and source_is(".mp4"):
+        # Mux both subtitle files into the container as subtitle tracks.
+        if target_is(".mp4"):
+            cm = f'ffmpeg -y -i "{original_file}" -i sub_tra.srt -i sub_ori.srt -map 0:v -map 0:a -map 1 -map 2 -c:v copy -c:a copy -c:s mov_text "{new_file_path}"'
+        else:
+            cm = f'ffmpeg -y -i "{original_file}" -i sub_tra.srt -i sub_ori.srt -map 0:v -map 0:a -map 1 -map 2 -c:v copy -c:a copy -c:s srt -movflags use_metadata_tags -map_metadata 0 "{new_file_path}"'
+    elif target_is(".mkv"):
+        cm = f'ffmpeg -i "{original_file}" -c:v copy -c:a copy "{new_file_path}"'
+    elif target_is(".wav") and not source_is(".wav"):
+        cm = f'ffmpeg -y -i "{original_file}" -acodec pcm_s16le -ar 44100 -ac 2 "{new_file_path}"'
+    elif target_is(".ogg"):
+        cm = f'ffmpeg -i "{original_file}" -c:a libvorbis "{new_file_path}"'
+    elif target_is(".mp3") and not source_is(".mp3"):
+        cm = f'ffmpeg -y -i "{original_file}" -codec:a libmp3lame -qscale:a 2 "{new_file_path}"'
+    if not cm:
+        shutil.copy2(original_file, new_file_path)
+        return os.path.abspath(new_file_path)
+    try:
+        run_command(cm)
+    except Exception as error:
+        # Best effort: fall back to a plain copy of the source file.
+        logger.error(str(error))
+        remove_files(new_file_path)
+        shutil.copy2(original_file, new_file_path)
+    return os.path.abspath(new_file_path)
+def media_out(
+    media_file,
+    lang_code,
+    media_out_name="",
+    extension="mp4",
+    file_obj="video_dub.mp4",
+    soft_subtitles=False,
+    subtitle_files="disable",
+):
+    """
+    Name and export the produced media (and optionally subtitle files).
+
+    Without ``media_out_name`` the output name is derived from the media
+    file name, or from the video title when ``media_file`` is not an
+    existing path, followed by ``__<lang_code>``.
+
+    Returns the output path, or with ``subtitle_files`` other than
+    "disable" a list ``[media, translated subtitles, original subtitles]``.
+    """
+    if not media_out_name:
+        if os.path.exists(media_file):
+            base_name = get_no_ext_filename(media_file)
+        else:
+            base_name, _ = get_video_info(media_file)
+        media_out_name = f"{base_name}__{lang_code}"
+    else:
+        base_name = media_out_name + "_origin"
+    safe_out_name = sanitize_file_name(media_out_name)
+    f_name = f"{safe_out_name}.{extension}"
+    if subtitle_files == "disable":
+        return get_output_file(file_obj, f_name, soft_subtitles)
+    exported = [get_output_file(file_obj, f_name, soft_subtitles)]
+    name_tra = f"{safe_out_name}.{subtitle_files}"
+    name_ori = f"{sanitize_file_name(base_name)}.{subtitle_files}"
+    # Translated subtitles first, then the original-language ones.
+    exported.append(
+        get_output_file(f"sub_tra.{subtitle_files}", name_tra, False)
+    )
+    exported.append(
+        get_output_file(f"sub_ori.{subtitle_files}", name_ori, False)
+    )
+    return exported
+def get_subtitle_speaker(media_file, result, language, extension, base_name):
+    """
+    Write one subtitle file per speaker and return the exported paths.
+
+    The segments of a deep copy of ``result`` are grouped by their
+    "speaker" value; each group is rendered with ``get_subtitle`` and
+    exported through ``media_out`` as
+    ``<base_name>_<language>_<speaker>``.
+    """
+    working_copy = copy.deepcopy(result)
+    # Sub segments by speaker
+    grouped = {}
+    for entry in working_copy["segments"]:
+        grouped.setdefault(entry["speaker"], []).append(entry)
+    if not base_name:
+        if os.path.exists(media_file):
+            base_name = get_no_ext_filename(media_file)
+        else:
+            base_name, _ = get_video_info(media_file)
+    files_subs = []
+    for speaker, speaker_segments in grouped.items():
+        speaker_subtitle = get_subtitle(
+            language,
+            {"segments": speaker_segments},
+            extension,
+            filename=speaker,
+        )
+        exported = media_out(
+            media_file,  # no need
+            language,
+            f"{base_name}_{language}_{speaker}",
+            extension,
+            file_obj=speaker_subtitle,
+        )
+        files_subs.append(exported)
+    return files_subs
+def sound_separate(media_file, task_uvr):
+    """
+    Extract the vocal and/or background track of ``media_file``.
+
+    ``task_uvr`` is searched for the words "vocal", "dereverb" and
+    "background". Each requested step is attempted independently and its
+    error is only logged; an exception is raised when no step produced a
+    file. Returns the list of produced audio paths.
+    """
+    from .mdx_net import process_uvr_task
+    wants_vocal = "vocal" in task_uvr
+    outputs = []
+    if wants_vocal:
+        try:
+            _, _, _, _, vocal_audio = process_uvr_task(
+                orig_song_path=media_file,
+                main_vocals=False,
+                dereverb="dereverb" in task_uvr,
+                remove_files_output_dir=True,
+            )
+        except Exception as error:
+            logger.error(str(error))
+        else:
+            outputs.append(vocal_audio)
+    if "background" in task_uvr:
+        try:
+            # Keep the output folder when the vocal step already ran.
+            background_audio, _ = process_uvr_task(
+                orig_song_path=media_file,
+                song_id="voiceless",
+                only_voiceless=True,
+                remove_files_output_dir=not wants_vocal,
+            )
+        except Exception as error:
+            logger.error(str(error))
+        else:
+            outputs.append(background_audio)
+    if not outputs:
+        raise Exception("Error in uvr process")
+    return outputs

soni_translate/preprocessor.py CHANGED Viewed

@@ -1,309 +1,309 @@
-from .utils import remove_files
-import os, shutil, subprocess, time, shlex, sys # noqa
-from .logging_setup import logger
-import json
-ERROR_INCORRECT_CODEC_PARAMETERS = [
-    "prores",  # mov
-    "ffv1",  # mkv
-    "msmpeg4v3",  # avi
-    "wmv2",  # wmv
-    "theora",  # ogv
-]  # fix final merge
-TESTED_CODECS = [
-    "h264",  # mp4
-    "h265",  # mp4
-    "hevc",  # test
-    "vp9",  # webm
-    "mpeg4",  # mp4
-    "mpeg2video",  # mpg
-    "mjpeg",  # avi
-]
-class OperationFailedError(Exception):
-    def __init__(self, message="The operation did not complete successfully."):
-        self.message = message
-        super().__init__(self.message)
-def get_video_codec(video_file):
-    command_base = rf'ffprobe -v error -select_streams v:0 -show_entries stream=codec_name -of json "{video_file}"'
-    command = shlex.split(command_base)
-    try:
-        process = subprocess.Popen(
-            command,
-            stdout=subprocess.PIPE,
-            creationflags=subprocess.CREATE_NO_WINDOW if sys.platform == "win32" else 0,
-        )
-        output, _ = process.communicate()
-        codec_info = json.loads(output.decode('utf-8'))
-        codec_name = codec_info['streams'][0]['codec_name']
-        return codec_name
-    except Exception as error:
-        logger.debug(str(error))
-        return None
-def audio_preprocessor(preview, base_audio, audio_wav, use_cuda=False):
-    base_audio = base_audio.strip()
-    previous_files_to_remove = [audio_wav]
-    remove_files(previous_files_to_remove)
-    if preview:
-        logger.warning(
-            "Creating a preview video of 10 seconds, to disable "
-            "this option, go to advanced settings and turn off preview."
-        )
-        wav_ = f'ffmpeg -y -i "{base_audio}" -ss 00:00:20 -t 00:00:10 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav'
-    else:
-        wav_ = f'ffmpeg -y -i "{base_audio}" -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav'
-    # Run cmd process
-    sub_params = {
-        "stdout": subprocess.PIPE,
-        "stderr": subprocess.PIPE,
-        "creationflags": subprocess.CREATE_NO_WINDOW
-        if sys.platform == "win32"
-        else 0,
-    }
-    wav_ = shlex.split(wav_)
-    result_convert_audio = subprocess.Popen(wav_, **sub_params)
-    output, errors = result_convert_audio.communicate()
-    time.sleep(1)
-    if result_convert_audio.returncode in [1, 2] or not os.path.exists(
-        audio_wav
-    ):
-        raise OperationFailedError(f"Error can't create the audio file:\n{errors.decode('utf-8')}")
-def audio_video_preprocessor(
-    preview, video, OutputFile, audio_wav, use_cuda=False
-):
-    video = video.strip()
-    previous_files_to_remove = [OutputFile, "audio.webm", audio_wav]
-    remove_files(previous_files_to_remove)
-    if os.path.exists(video):
-        if preview:
-            logger.warning(
-                "Creating a preview video of 10 seconds, "
-                "to disable this option, go to advanced "
-                "settings and turn off preview."
-            )
-            mp4_ = f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4'
-        else:
-            video_codec = get_video_codec(video)
-            if not video_codec:
-                logger.debug("No video codec found in video")
-            else:
-                logger.info(f"Video codec: {video_codec}")
-            # Check if the file ends with ".mp4" extension or is valid codec
-            if video.endswith(".mp4") or video_codec in TESTED_CODECS:
-                destination_path = os.path.join(os.getcwd(), "Video.mp4")
-                shutil.copy(video, destination_path)
-                time.sleep(0.5)
-                if os.path.exists(OutputFile):
-                    mp4_ = "ffmpeg -h"
-                else:
-                    mp4_ = f'ffmpeg -y -i "{video}" -c copy Video.mp4'
-            else:
-                logger.warning(
-                    "File does not have the '.mp4' extension  or a "
-                    "supported codec. Converting video to mp4 (codec: h264)."
-                )
-                mp4_ = f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4'
-    else:
-        if preview:
-            logger.warning(
-                "Creating a preview from the link, 10 seconds "
-                "to disable this option, go to advanced "
-                "settings and turn off preview."
-            )
-            # https://github.com/yt-dlp/yt-dlp/issues/2220
-            mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-playlist --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
-            wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
-        else:
-            mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-playlist --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
-            wav_ = f"python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-playlist --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}"
-    # Run cmd process
-    mp4_ = shlex.split(mp4_)
-    sub_params = {
-        "stdout": subprocess.PIPE,
-        "stderr": subprocess.PIPE,
-        "creationflags": subprocess.CREATE_NO_WINDOW
-        if sys.platform == "win32"
-        else 0,
-    }
-    if os.path.exists(video):
-        logger.info("Process video...")
-        result_convert_video = subprocess.Popen(mp4_, **sub_params)
-        # result_convert_video.wait()
-        output, errors = result_convert_video.communicate()
-        time.sleep(1)
-        if result_convert_video.returncode in [1, 2] or not os.path.exists(
-            OutputFile
-        ):
-            raise OperationFailedError(f"Error processing video:\n{errors.decode('utf-8')}")
-        logger.info("Process audio...")
-        wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
-        wav_ = shlex.split(wav_)
-        result_convert_audio = subprocess.Popen(wav_, **sub_params)
-        output, errors = result_convert_audio.communicate()
-        time.sleep(1)
-        if result_convert_audio.returncode in [1, 2] or not os.path.exists(
-            audio_wav
-        ):
-            raise OperationFailedError(f"Error can't create the audio file:\n{errors.decode('utf-8')}")
-    else:
-        wav_ = shlex.split(wav_)
-        if preview:
-            result_convert_video = subprocess.Popen(mp4_, **sub_params)
-            output, errors = result_convert_video.communicate()
-            time.sleep(0.5)
-            result_convert_audio = subprocess.Popen(wav_, **sub_params)
-            output, errors = result_convert_audio.communicate()
-            time.sleep(0.5)
-            if result_convert_audio.returncode in [1, 2] or not os.path.exists(
-                audio_wav
-            ):
-                raise OperationFailedError(
-                    f"Error can't create the preview file:\n{errors.decode('utf-8')}"
-                )
-        else:
-            logger.info("Process audio...")
-            result_convert_audio = subprocess.Popen(wav_, **sub_params)
-            output, errors = result_convert_audio.communicate()
-            time.sleep(1)
-            if result_convert_audio.returncode in [1, 2] or not os.path.exists(
-                audio_wav
-            ):
-                raise OperationFailedError(f"Error can't download the audio:\n{errors.decode('utf-8')}")
-            logger.info("Process video...")
-            result_convert_video = subprocess.Popen(mp4_, **sub_params)
-            output, errors = result_convert_video.communicate()
-            time.sleep(1)
-            if result_convert_video.returncode in [1, 2] or not os.path.exists(
-                OutputFile
-            ):
-                raise OperationFailedError(f"Error can't download the video:\n{errors.decode('utf-8')}")
-def old_audio_video_preprocessor(preview, video, OutputFile, audio_wav):
-    """
-    Legacy routine that prepares the working video and its WAV audio track.
-    Parameters:
-    - preview (bool): When true, only a 10-second clip starting at
-        00:00:20 is produced.
-    - video (str): Path of an existing local file; any value that is not
-        an existing path is passed to yt-dlp as the download target.
-    - OutputFile (str): Video file removed up front, checked for existence
-        and used as the yt-dlp output; the ffmpeg commands write the
-        literal name "Video.mp4".
-    - audio_wav (str): Audio file removed up front, checked for existence
-        and used as the yt-dlp output; the ffmpeg commands write the
-        literal name "audio.wav".
-    Raises:
-    - OperationFailedError: When a command exits with code 1 or 2, or an
-        expected file does not appear within a 120-iteration polling loop.
-    NOTE(review): every command is an interpolated string run with
-    shell=True and the link is inserted unquoted - confirm this function
-    is never reached with untrusted input.
-    """
-    # Remove leftovers from a previous run.
-    previous_files_to_remove = [OutputFile, "audio.webm", audio_wav]
-    remove_files(previous_files_to_remove)
-    # Local file: convert (or copy) it, then extract the audio from it.
-    if os.path.exists(video):
-        if preview:
-            logger.warning(
-                "Creating a preview video of 10 seconds, "
-                "to disable this option, go to advanced "
-                "settings and turn off preview."
-            )
-            command = f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4'
-            result_convert_video = subprocess.run(
-                command, capture_output=True, text=True, shell=True
-            )
-        else:
-            # Check if the file ends with ".mp4" extension
-            if video.endswith(".mp4"):
-                destination_path = os.path.join(os.getcwd(), "Video.mp4")
-                shutil.copy(video, destination_path)
-                # Dead placeholder: replaced by the subprocess result below.
-                result_convert_video = {}
-                # Dummy command so the return-code check further down has
-                # a result object to inspect.
-                result_convert_video = subprocess.run(
-                    "echo Video copied",
-                    capture_output=True,
-                    text=True,
-                    shell=True,
-                )
-            else:
-                logger.warning(
-                    "File does not have the '.mp4' extension. Converting video."
-                )
-                command = f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4'
-                result_convert_video = subprocess.run(
-                    command, capture_output=True, text=True, shell=True
-                )
-        if result_convert_video.returncode in [1, 2]:
-            raise OperationFailedError("Error can't convert the video")
-        # Poll up to 120 times, one second apart, for the video file; the
-        # audio is extracted as soon as it exists.
-        for i in range(120):
-            time.sleep(1)
-            logger.info("Process video...")
-            if os.path.exists(OutputFile):
-                time.sleep(1)
-                command = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
-                result_convert_audio = subprocess.run(
-                    command, capture_output=True, text=True, shell=True
-                )
-                time.sleep(1)
-                break
-            if i == 119:
-                # if not os.path.exists(OutputFile):
-                raise OperationFailedError("Error processing video")
-        if result_convert_audio.returncode in [1, 2]:
-            raise OperationFailedError(
-                f"Error can't create the audio file: {result_convert_audio.stderr}"
-            )
-        # Same polling scheme for the extracted audio file.
-        for i in range(120):
-            time.sleep(1)
-            logger.info("Process audio...")
-            if os.path.exists(audio_wav):
-                break
-            if i == 119:
-                raise OperationFailedError("Error can't create the audio file")
-    else:
-        # Not an existing path: hand the value to yt-dlp as a link.
-        video = video.strip()
-        if preview:
-            logger.warning(
-                "Creating a preview from the link, 10 "
-                "seconds to disable this option, go to "
-                "advanced settings and turn off preview."
-            )
-            # https://github.com/yt-dlp/yt-dlp/issues/2220
-            mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
-            wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
-            # Only the audio extraction result is checked; the download
-            # result is not inspected here.
-            result_convert_video = subprocess.run(
-                mp4_, capture_output=True, text=True, shell=True
-            )
-            result_convert_audio = subprocess.run(
-                wav_, capture_output=True, text=True, shell=True
-            )
-            if result_convert_audio.returncode in [1, 2]:
-                raise OperationFailedError("Error can't download a preview")
-        else:
-            mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
-            wav_ = f"python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}"
-            # The audio is downloaded first.
-            result_convert_audio = subprocess.run(
-                wav_, capture_output=True, text=True, shell=True
-            )
-            if result_convert_audio.returncode in [1, 2]:
-                raise OperationFailedError("Error can't download the audio")
-            # The video download starts once audio_wav exists and the
-            # intermediate "audio.webm" is gone.
-            for i in range(120):
-                time.sleep(1)
-                logger.info("Process audio...")
-                if os.path.exists(audio_wav) and not os.path.exists(
-                    "audio.webm"
-                ):
-                    time.sleep(1)
-                    result_convert_video = subprocess.run(
-                        mp4_, capture_output=True, text=True, shell=True
-                    )
-                    break
-                if i == 119:
-                    raise OperationFailedError("Error downloading the audio")
-            if result_convert_video.returncode in [1, 2]:
-                raise OperationFailedError("Error can't download the video")

+from .utils import remove_files
+import os, shutil, subprocess, time, shlex, sys # noqa
+from .logging_setup import logger
+import json
+# Video codec names, each annotated with the container it usually ships in.
+# NOTE(review): this list is not referenced in this part of the file; the
+# "fix final merge" remark suggests it is consulted when the final video is
+# merged - confirm against the caller.
+ERROR_INCORRECT_CODEC_PARAMETERS = [
+    "prores",  # mov
+    "ffv1",  # mkv
+    "msmpeg4v3",  # avi
+    "wmv2",  # wmv
+    "theora",  # ogv
+]  # fix final merge
+# Codec names for which audio_video_preprocessor copies a local input file
+# to Video.mp4 unchanged instead of re-encoding it to h264.
+TESTED_CODECS = [
+    "h264",  # mp4
+    "h265",  # mp4
+    "hevc",
+    "vp9",  # webm
+    "mpeg4",  # mp4
+    "mpeg2video",  # mpg
+    "mjpeg",  # avi
+]
+class OperationFailedError(Exception):
+    """Raised when a download or conversion step does not succeed."""
+    def __init__(self, message="The operation did not complete successfully."):
+        # Hand the text to Exception first, then expose it as an attribute.
+        super().__init__(message)
+        self.message = message
+def get_video_codec(video_file):
+    """
+    Return the codec name of the first video stream of a media file.
+    Parameters:
+    - video_file (str): Path of the file to inspect with ffprobe.
+    Returns:
+    - str or None: The codec name printed by ffprobe, or None when
+        ffprobe cannot be run, reports no video stream, or prints output
+        that cannot be parsed.
+    """
+    # Pass the arguments as a list so the path reaches ffprobe untouched,
+    # even when it contains quotes or backslashes that a shlex round-trip
+    # of a formatted string would reinterpret.
+    command = [
+        "ffprobe",
+        "-v", "error",
+        "-select_streams", "v:0",
+        "-show_entries", "stream=codec_name",
+        "-of", "json",
+        video_file,
+    ]
+    try:
+        process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            creationflags=subprocess.CREATE_NO_WINDOW if sys.platform == "win32" else 0,
+        )
+        output, _ = process.communicate()
+        codec_info = json.loads(output.decode("utf-8"))
+        return codec_info["streams"][0]["codec_name"]
+    except Exception as error:
+        # Best effort: any failure is logged and reported as "unknown".
+        logger.debug(str(error))
+        return None
+def audio_preprocessor(preview, base_audio, audio_wav, use_cuda=False):
+    """
+    Convert a media file to a 16-bit PCM, 44.1 kHz, stereo WAV with ffmpeg.
+    Parameters:
+    - preview (bool): When true, only 10 seconds starting at 00:00:20 are
+        converted.
+    - base_audio (str): Path of the input file; surrounding whitespace is
+        stripped.
+    - audio_wav (str): Path of the WAV file to create; an existing file
+        is removed first.
+    - use_cuda (bool): Accepted for interface compatibility; not used.
+    Raises:
+    - OperationFailedError: When ffmpeg exits with code 1 or 2 or the
+        output file is missing afterwards.
+    """
+    base_audio = base_audio.strip()
+    remove_files([audio_wav])
+    # Build the argv list directly so that paths with quotes, spaces or
+    # backslashes are not re-parsed by shlex.
+    command = ["ffmpeg", "-y", "-i", base_audio]
+    if preview:
+        logger.warning(
+            "Creating a preview video of 10 seconds, to disable "
+            "this option, go to advanced settings and turn off preview."
+        )
+        command += ["-ss", "00:00:20", "-t", "00:00:10"]
+    # Write to audio_wav itself: the success check below looks for that
+    # path, so a hard-coded "audio.wav" would fail for any other name.
+    command += [
+        "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2", audio_wav
+    ]
+    # Run cmd process
+    result_convert_audio = subprocess.Popen(
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        creationflags=subprocess.CREATE_NO_WINDOW
+        if sys.platform == "win32"
+        else 0,
+    )
+    _, errors = result_convert_audio.communicate()
+    time.sleep(1)
+    if result_convert_audio.returncode in [1, 2] or not os.path.exists(
+        audio_wav
+    ):
+        # errors="replace" keeps a non-UTF-8 byte in ffmpeg's output from
+        # masking the real failure with a UnicodeDecodeError.
+        raise OperationFailedError(
+            f"Error can't create the audio file:\n{errors.decode('utf-8', errors='replace')}"
+        )
+def _run_media_command(
+    command, expected_file, error_message, pause=1, check_returncode=True
+):
+    """Run an argv list to completion and raise if it left no usable output."""
+    process = subprocess.Popen(
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        creationflags=subprocess.CREATE_NO_WINDOW
+        if sys.platform == "win32"
+        else 0,
+    )
+    _, errors = process.communicate()
+    # Short pause before looking for the output file, as in the original flow.
+    time.sleep(pause)
+    # Only exit codes 1 and 2 count as failures, as in the original checks.
+    # NOTE(review): yt-dlp presumably exits with a different non-zero code
+    # when --max-downloads stops it, which would explain not testing for
+    # "!= 0" - confirm before tightening this.
+    failed = check_returncode and process.returncode in [1, 2]
+    if failed or not os.path.exists(expected_file):
+        # errors="replace" keeps undecodable tool output from hiding the
+        # real failure behind a UnicodeDecodeError.
+        raise OperationFailedError(
+            f"{error_message}:\n{errors.decode('utf-8', errors='replace')}"
+        )
+def audio_video_preprocessor(
+    preview, video, OutputFile, audio_wav, use_cuda=False
+):
+    """
+    Produce the working video file and its WAV audio track.
+    Parameters:
+    - preview (bool): When true, only 10 seconds starting at 00:00:20 are
+        produced.
+    - video (str): Path of an existing local file; any value that is not
+        an existing path is passed to yt-dlp as the download target.
+    - OutputFile (str): Path of the video file to create.
+    - audio_wav (str): Path of the WAV file to create.
+    - use_cuda (bool): Accepted for interface compatibility; not used.
+    Raises:
+    - OperationFailedError: When a step exits with code 1 or 2 or its
+        output file is missing afterwards.
+    """
+    video = video.strip()
+    remove_files([OutputFile, "audio.webm", audio_wav])
+    # Commands are argv lists, so paths and links are passed verbatim and
+    # are never re-parsed by shlex. Every step reads and writes OutputFile
+    # and audio_wav rather than hard-coded names, matching the files that
+    # the success checks look for.
+    extract_audio = [
+        "ffmpeg", "-y", "-i", OutputFile,
+        "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2", audio_wav,
+    ]
+    encode_h264 = [
+        "-c:v", "libx264", "-c:a", "aac", "-strict", "experimental", OutputFile
+    ]
+    if os.path.exists(video):
+        if preview:
+            logger.warning(
+                "Creating a preview video of 10 seconds, "
+                "to disable this option, go to advanced "
+                "settings and turn off preview."
+            )
+            video_command = [
+                "ffmpeg", "-y", "-i", video,
+                "-ss", "00:00:20", "-t", "00:00:10", *encode_h264,
+            ]
+        else:
+            video_codec = get_video_codec(video)
+            if not video_codec:
+                logger.debug("No video codec found in video")
+            else:
+                logger.info(f"Video codec: {video_codec}")
+            # Check if the file ends with ".mp4" extension or is valid codec
+            if video.endswith(".mp4") or video_codec in TESTED_CODECS:
+                shutil.copy(video, OutputFile)
+                time.sleep(0.5)
+                if os.path.exists(OutputFile):
+                    # Copy succeeded: no conversion process is needed.
+                    video_command = None
+                else:
+                    video_command = [
+                        "ffmpeg", "-y", "-i", video, "-c", "copy", OutputFile
+                    ]
+            else:
+                logger.warning(
+                    "File does not have the '.mp4' extension  or a "
+                    "supported codec. Converting video to mp4 (codec: h264)."
+                )
+                video_command = ["ffmpeg", "-y", "-i", video, *encode_h264]
+        logger.info("Process video...")
+        if video_command is not None:
+            _run_media_command(
+                video_command, OutputFile, "Error processing video"
+            )
+        logger.info("Process audio...")
+        _run_media_command(
+            extract_audio, audio_wav, "Error can't create the audio file"
+        )
+        return
+    # Not an existing path: hand the value to yt-dlp as a link.
+    shared_options = [
+        "--force-overwrites",
+        "--max-downloads", "1",
+        "--no-warnings",
+        "--no-playlist",
+        "--no-abort-on-error",
+        "--ignore-no-formats-error",
+    ]
+    if preview:
+        logger.warning(
+            "Creating a preview from the link, 10 seconds "
+            "to disable this option, go to advanced "
+            "settings and turn off preview."
+        )
+        # https://github.com/yt-dlp/yt-dlp/issues/2220
+        download_video = [
+            "yt-dlp", "-f", "mp4",
+            "--downloader", "ffmpeg",
+            "--downloader-args", "ffmpeg_i: -ss 00:00:20 -t 00:00:10",
+            *shared_options,
+            "--restrict-filenames", "-o", OutputFile, video,
+        ]
+        # Stop here if the clip was not downloaded, so the yt-dlp output is
+        # reported instead of an ffmpeg "no such file" message. The exit
+        # code is not checked for this step, as before.
+        _run_media_command(
+            download_video,
+            OutputFile,
+            "Error can't create the preview file",
+            pause=0.5,
+            check_returncode=False,
+        )
+        _run_media_command(
+            extract_audio,
+            audio_wav,
+            "Error can't create the preview file",
+            pause=0.5,
+        )
+    else:
+        download_audio = [
+            "python", "-m", "yt_dlp", "--output", audio_wav,
+            *shared_options,
+            "--extract-audio", "--audio-format", "wav", video,
+        ]
+        download_video = [
+            "yt-dlp", "-f", "mp4",
+            *shared_options,
+            "--restrict-filenames", "-o", OutputFile, video,
+        ]
+        logger.info("Process audio...")
+        _run_media_command(
+            download_audio, audio_wav, "Error can't download the audio"
+        )
+        logger.info("Process video...")
+        _run_media_command(
+            download_video, OutputFile, "Error can't download the video"
+        )
+def old_audio_video_preprocessor(preview, video, OutputFile, audio_wav):
+    """
+    Legacy routine that prepares the working video and its WAV audio track.
+    Parameters:
+    - preview (bool): When true, only a 10-second clip starting at
+        00:00:20 is produced.
+    - video (str): Path of an existing local file; any value that is not
+        an existing path is passed to yt-dlp as the download target.
+    - OutputFile (str): Video file removed up front, checked for existence
+        and used as the yt-dlp output; the ffmpeg commands write the
+        literal name "Video.mp4".
+    - audio_wav (str): Audio file removed up front, checked for existence
+        and used as the yt-dlp output; the ffmpeg commands write the
+        literal name "audio.wav".
+    Raises:
+    - OperationFailedError: When a command exits with code 1 or 2, or an
+        expected file does not appear within a 120-iteration polling loop.
+    NOTE(review): every command is an interpolated string run with
+    shell=True and the link is inserted unquoted - confirm this function
+    is never reached with untrusted input.
+    """
+    # Remove leftovers from a previous run.
+    previous_files_to_remove = [OutputFile, "audio.webm", audio_wav]
+    remove_files(previous_files_to_remove)
+    # Local file: convert (or copy) it, then extract the audio from it.
+    if os.path.exists(video):
+        if preview:
+            logger.warning(
+                "Creating a preview video of 10 seconds, "
+                "to disable this option, go to advanced "
+                "settings and turn off preview."
+            )
+            command = f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4'
+            result_convert_video = subprocess.run(
+                command, capture_output=True, text=True, shell=True
+            )
+        else:
+            # Check if the file ends with ".mp4" extension
+            if video.endswith(".mp4"):
+                destination_path = os.path.join(os.getcwd(), "Video.mp4")
+                shutil.copy(video, destination_path)
+                # Dead placeholder: replaced by the subprocess result below.
+                result_convert_video = {}
+                # Dummy command so the return-code check further down has
+                # a result object to inspect.
+                result_convert_video = subprocess.run(
+                    "echo Video copied",
+                    capture_output=True,
+                    text=True,
+                    shell=True,
+                )
+            else:
+                logger.warning(
+                    "File does not have the '.mp4' extension. Converting video."
+                )
+                command = f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4'
+                result_convert_video = subprocess.run(
+                    command, capture_output=True, text=True, shell=True
+                )
+        if result_convert_video.returncode in [1, 2]:
+            raise OperationFailedError("Error can't convert the video")
+        # Poll up to 120 times, one second apart, for the video file; the
+        # audio is extracted as soon as it exists.
+        for i in range(120):
+            time.sleep(1)
+            logger.info("Process video...")
+            if os.path.exists(OutputFile):
+                time.sleep(1)
+                command = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
+                result_convert_audio = subprocess.run(
+                    command, capture_output=True, text=True, shell=True
+                )
+                time.sleep(1)
+                break
+            if i == 119:
+                # if not os.path.exists(OutputFile):
+                raise OperationFailedError("Error processing video")
+        if result_convert_audio.returncode in [1, 2]:
+            raise OperationFailedError(
+                f"Error can't create the audio file: {result_convert_audio.stderr}"
+            )
+        # Same polling scheme for the extracted audio file.
+        for i in range(120):
+            time.sleep(1)
+            logger.info("Process audio...")
+            if os.path.exists(audio_wav):
+                break
+            if i == 119:
+                raise OperationFailedError("Error can't create the audio file")
+    else:
+        # Not an existing path: hand the value to yt-dlp as a link.
+        video = video.strip()
+        if preview:
+            logger.warning(
+                "Creating a preview from the link, 10 "
+                "seconds to disable this option, go to "
+                "advanced settings and turn off preview."
+            )
+            # https://github.com/yt-dlp/yt-dlp/issues/2220
+            mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
+            wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
+            # Only the audio extraction result is checked; the download
+            # result is not inspected here.
+            result_convert_video = subprocess.run(
+                mp4_, capture_output=True, text=True, shell=True
+            )
+            result_convert_audio = subprocess.run(
+                wav_, capture_output=True, text=True, shell=True
+            )
+            if result_convert_audio.returncode in [1, 2]:
+                raise OperationFailedError("Error can't download a preview")
+        else:
+            mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
+            wav_ = f"python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}"
+            # The audio is downloaded first.
+            result_convert_audio = subprocess.run(
+                wav_, capture_output=True, text=True, shell=True
+            )
+            if result_convert_audio.returncode in [1, 2]:
+                raise OperationFailedError("Error can't download the audio")
+            # The video download starts once audio_wav exists and the
+            # intermediate "audio.webm" is gone.
+            for i in range(120):
+                time.sleep(1)
+                logger.info("Process audio...")
+                if os.path.exists(audio_wav) and not os.path.exists(
+                    "audio.webm"
+                ):
+                    time.sleep(1)
+                    result_convert_video = subprocess.run(
+                        mp4_, capture_output=True, text=True, shell=True
+                    )
+                    break
+                if i == 119:
+                    raise OperationFailedError("Error downloading the audio")
+            if result_convert_video.returncode in [1, 2]:
+                raise OperationFailedError("Error can't download the video")

soni_translate/speech_segmentation.py CHANGED Viewed

@@ -1,499 +1,447 @@
-from whisperx.alignment import (
-    DEFAULT_ALIGN_MODELS_TORCH as DAMT,
-    DEFAULT_ALIGN_MODELS_HF as DAMHF,
-)
-from whisperx.utils import TO_LANGUAGE_CODE
-import whisperx
-import torch
-import gc
-import os
-import soundfile as sf
-from IPython.utils import capture # noqa
-from .language_configuration import EXTRA_ALIGN, INVERTED_LANGUAGES
-from .logging_setup import logger
-from .postprocessor import sanitize_file_name
-from .utils import remove_directory_contents, run_command
-# ZERO GPU CONFIG
-import spaces
-import copy
-import random
-import time
-def random_sleep():
-    """Sleep 7.2 to 9.9 seconds when the ZERO_GPU variable is "TRUE"."""
-    zero_gpu_enabled = os.environ.get("ZERO_GPU") == "TRUE"
-    if not zero_gpu_enabled:
-        return
-    print("Random sleep")
-    # One decimal place is kept, so the delay is a multiple of 0.1 s.
-    delay = round(random.uniform(7.2, 9.9), 1)
-    time.sleep(delay)
-@spaces.GPU
-def load_and_transcribe_audio(asr_model, audio, compute_type, language, asr_options, batch_size, segment_duration_limit):
-    """
-    Load a whisperx model, transcribe the audio and release the model.
-    Parameters:
-    - asr_model (str): Model name or path passed to whisperx.load_model.
-    - audio: Audio data passed to model.transcribe.
-    - compute_type (str): Compute type passed to whisperx.load_model.
-    - language (str): Language passed to whisperx.load_model.
-    - asr_options (dict): Options passed to whisperx.load_model.
-    - batch_size (int): Batch size used for transcription.
-    - segment_duration_limit (int): Passed to transcribe as chunk_size.
-    Returns:
-    - The value returned by model.transcribe.
-    """
-    # "cuda" is forced when ZERO_GPU is "TRUE"; otherwise SONITR_DEVICE
-    # selects the device.
-    if os.environ.get("ZERO_GPU") == "TRUE":
-        device = "cuda"
-    else:
-        device = os.environ.get("SONITR_DEVICE")
-    # Load model
-    model = whisperx.load_model(
-        asr_model,
-        device,
-        compute_type=compute_type,
-        language=language,
-        asr_options=asr_options,
-    )
-    try:
-        # Transcribe audio
-        return model.transcribe(
-            audio,
-            batch_size=batch_size,
-            chunk_size=segment_duration_limit,
-            print_progress=True,
-        )
-    finally:
-        # Run the cleanup even when transcription raises, so a failed run
-        # does not skip the model release and cache flush.
-        del model
-        gc.collect()
-        torch.cuda.empty_cache()  # noqa
-def load_align_and_align_segments(result, audio, DAMHF):
-    """
-    Load the alignment model for the result's language and align segments.
-    Parameters:
-    - result (dict): Transcription with "language" and "segments" keys.
-    - audio: Audio data passed to whisperx.align.
-    - DAMHF (dict): Languages for which no explicit model name is passed;
-        for any other language the name is read from EXTRA_ALIGN.
-    Returns:
-    - The value returned by whisperx.align, requested with character
-        alignments.
-    """
-    # "cuda" is forced when ZERO_GPU is "TRUE"; otherwise SONITR_DEVICE
-    # selects the device.
-    if os.environ.get("ZERO_GPU") == "TRUE":
-        device = "cuda"
-    else:
-        device = os.environ.get("SONITR_DEVICE")
-    language_code = result["language"]
-    # Load alignment model
-    model_a, metadata = whisperx.load_align_model(
-        language_code=language_code,
-        device=device,
-        model_name=None
-        if language_code in DAMHF
-        else EXTRA_ALIGN[language_code],
-    )
-    try:
-        # Align segments
-        return whisperx.align(
-            result["segments"],
-            model_a,
-            metadata,
-            audio,
-            device,
-            return_char_alignments=True,
-            print_progress=False,
-        )
-    finally:
-        # Clean up, also when alignment raises.
-        del model_a
-        gc.collect()
-        torch.cuda.empty_cache()  # noqa
-@spaces.GPU
-def diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers):
-    """Run the diarization model on audio_wav and return its segments."""
-    on_zero_gpu = os.environ.get("ZERO_GPU") == "TRUE"
-    if on_zero_gpu:
-        # Move the wrapped model to CUDA before running it.
-        diarize_model.model.to(torch.device("cuda"))
-    speaker_limits = {
-        "min_speakers": min_speakers,
-        "max_speakers": max_speakers,
-    }
-    return diarize_model(audio_wav, **speaker_limits)
-# ZERO GPU CONFIG
-# Model names that transcribe_speech uses as given. Any other value is
-# converted with ctranslate2 into a folder under WHISPER_MODELS_PATH, and
-# "OpenAI_API_Whisper" routes the request to openai_api_whisper.
-ASR_MODEL_OPTIONS = [
-    "tiny",
-    "base",
-    "small",
-    "medium",
-    "large",
-    "large-v1",
-    "large-v2",
-    "large-v3",
-    "distil-large-v2",
-    "Systran/faster-distil-whisper-large-v3",
-    "tiny.en",
-    "base.en",
-    "small.en",
-    "medium.en",
-    "distil-small.en",
-    "distil-medium.en",
-    "OpenAI_API_Whisper",
-]
-# Compute-type names for GPU execution.
-# NOTE(review): not referenced in this part of the file; presumably offered
-# as choices by the caller - confirm.
-COMPUTE_TYPE_GPU = [
-    "default",
-    "auto",
-    "int8",
-    "int8_float32",
-    "int8_float16",
-    "int8_bfloat16",
-    "float16",
-    "bfloat16",
-    "float32"
-]
-# Compute-type names for CPU execution (same caveat as the GPU list).
-COMPUTE_TYPE_CPU = [
-    "default",
-    "auto",
-    "int8",
-    "int8_float32",
-    "int16",
-    "float32",
-]
-# Directory scanned by find_whisper_models and filled by transcribe_speech.
-WHISPER_MODELS_PATH = './WHISPER_MODELS'
-def openai_api_whisper(
-    input_audio_file,
-    source_lang=None,
-    chunk_duration=1800
-):
-    """
-    Transcribe an audio file with the OpenAI "whisper-1" API model.
-    Parameters:
-    - input_audio_file (str): Path of the audio file to transcribe.
-    - source_lang (str or None): Language passed to the API; when empty,
-        the language reported for the first chunk is used.
-    - chunk_duration (int): Maximum chunk length in seconds; longer files
-        are split with ffmpeg before upload.
-    Returns:
-    - Tuple containing:
-        - audio: Audio loaded with whisperx.load_audio.
-        - result: Dictionary with "segments" (text, start, end) and
-            "language".
-    """
-    info = sf.info(input_audio_file)
-    duration = info.duration
-    output_directory = "./whisper_api_audio_parts"
-    os.makedirs(output_directory, exist_ok=True)
-    remove_directory_contents(output_directory)
-    if duration > chunk_duration:
-        # Split the audio file into chunks of chunk_duration seconds
-        cm = f'ffmpeg -i "{input_audio_file}" -f segment -segment_time {chunk_duration} -c:a libvorbis "{output_directory}/output%03d.ogg"'
-        run_command(cm)
-        # Get list of generated chunk files
-        chunk_files = sorted(
-            [f"{output_directory}/{f}" for f in os.listdir(output_directory) if f.endswith('.ogg')]
-        )
-    else:
-        one_file = f"{output_directory}/output000.ogg"
-        cm = f'ffmpeg -i "{input_audio_file}" -c:a libvorbis {one_file}'
-        run_command(cm)
-        chunk_files = [one_file]
-    # Transcript
-    segments = []
-    language = source_lang if source_lang else None
-    # Import lazily, but build the client once instead of once per chunk.
-    from openai import OpenAI
-    client = OpenAI()
-    for i, chunk in enumerate(chunk_files):
-        # The context manager closes each chunk file after the upload.
-        with open(chunk, "rb") as audio_file:
-            transcription = client.audio.transcriptions.create(
-                model="whisper-1",
-                file=audio_file,
-                language=language,
-                response_format="verbose_json",
-                timestamp_granularities=["segment"],
-            )
-        try:
-            transcript_dict = transcription.model_dump()
-        except Exception:
-            # Fall back to to_dict() when model_dump() is not usable.
-            transcript_dict = transcription.to_dict()
-        if language is None:
-            logger.info(f'Language detected: {transcript_dict["language"]}')
-            language = TO_LANGUAGE_CODE[transcript_dict["language"]]
-        # Shift chunk-relative timestamps to positions in the full file.
-        chunk_time = chunk_duration * i
-        for seg in transcript_dict["segments"]:
-            if "start" in seg:
-                segments.append(
-                    {
-                        "text": seg["text"],
-                        "start": seg["start"] + chunk_time,
-                        "end": seg["end"] + chunk_time,
-                    }
-                )
-    audio = whisperx.load_audio(input_audio_file)
-    result = {"segments": segments, "language": language}
-    return audio, result
-def find_whisper_models():
-    """List the folders in WHISPER_MODELS_PATH that contain a model.bin."""
-    root = WHISPER_MODELS_PATH
-    if not os.path.exists(root):
-        return []
-    return [
-        entry
-        for entry in os.listdir(root)
-        if os.path.isdir(os.path.join(root, entry))
-        and 'model.bin' in os.listdir(os.path.join(root, entry))
-    ]
-def transcribe_speech(
-    audio_wav,
-    asr_model,
-    compute_type,
-    batch_size,
-    SOURCE_LANGUAGE,
-    literalize_numbers=True,
-    segment_duration_limit=15,
-):
-    """
-    Transcribe speech using a whisper model.
-    Parameters:
-    - audio_wav (str): Path to the audio file in WAV format.
-    - asr_model (str): The whisper model to be loaded. A name outside
-        ASR_MODEL_OPTIONS is converted with ctranslate2 into a folder
-        under WHISPER_MODELS_PATH on first use.
-    - compute_type (str): Type of compute to be used (e.g., 'int8', 'float16').
-    - batch_size (int): Batch size for transcription.
-    - SOURCE_LANGUAGE (str): Source language for transcription.
-    - literalize_numbers (bool): Passed to the model as the
-        "suppress_numerals" option.
-    - segment_duration_limit (int): Passed on to
-        load_and_transcribe_audio, which uses it as the chunk size.
-    Returns:
-    - Tuple containing:
-        - audio: Loaded audio file.
-        - result: Transcription result as a dictionary.
-    """
-    # The hosted API model is handled by a separate code path.
-    if asr_model == "OpenAI_API_Whisper":
-        if literalize_numbers:
-            logger.info(
-                "OpenAI's API Whisper does not support "
-                "the literalization of numbers."
-            )
-        return openai_api_whisper(audio_wav, SOURCE_LANGUAGE)
-    # https://github.com/openai/whisper/discussions/277
-    # Initial prompt ("The following are sentences in Mandarin."), set only
-    # when the source language is exactly "zh".
-    prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None
-    # "zh-TW" is sent to the model as "zh", without the prompt.
-    SOURCE_LANGUAGE = (
-        SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
-    )
-    asr_options = {
-        "initial_prompt": prompt,
-        "suppress_numerals": literalize_numbers
-    }
-    if asr_model not in ASR_MODEL_OPTIONS:
-        base_dir = WHISPER_MODELS_PATH
-        if not os.path.exists(base_dir):
-            os.makedirs(base_dir)
-        model_dir = os.path.join(base_dir, sanitize_file_name(asr_model))
-        # Convert only once; an existing folder is reused as is.
-        if not os.path.exists(model_dir):
-            from ctranslate2.converters import TransformersConverter
-            quantization = "float32"
-            # Download new model
-            try:
-                converter = TransformersConverter(
-                    asr_model,
-                    low_cpu_mem_usage=True,
-                    copy_files=[
-                        "tokenizer_config.json", "preprocessor_config.json"
-                    ]
-                )
-                converter.convert(
-                    model_dir,
-                    quantization=quantization,
-                    force=False
-                )
-            except Exception as error:
-                # Retry with tokenizer.json when the model has no
-                # tokenizer_config.json; any other error is re-raised.
-                if "File tokenizer_config.json does not exist" in str(error):
-                    converter._copy_files = [
-                        "tokenizer.json", "preprocessor_config.json"
-                    ]
-                    converter.convert(
-                        model_dir,
-                        quantization=quantization,
-                        force=True
-                    )
-                else:
-                    raise error
-        asr_model = model_dir
-        logger.info(f"ASR Model: {str(model_dir)}")
-    audio = whisperx.load_audio(audio_wav)
-    result = load_and_transcribe_audio(
-        asr_model, audio, compute_type, SOURCE_LANGUAGE, asr_options, batch_size, segment_duration_limit
-    )
-    # A "zh" result obtained without the prompt is relabelled "zh-TW".
-    if result["language"] == "zh" and not prompt:
-        result["language"] = "zh-TW"
-        logger.info("Chinese - Traditional (zh-TW)")
-    return audio, result
-def align_speech(audio, result):
-    """
-    Align transcribed segments with the audio when a model is available.
-    Parameters:
-    - audio (array): Audio data handed to the alignment step.
-    - result (dict): Transcription containing "language" and "segments".
-    Returns:
-    - result (dict): The aligned result, or the input unchanged when
-        EXTRA_ALIGN maps the language to an empty model name.
-    Raises:
-    - ValueError: When the language is in neither the whisperx default
-        alignment models nor EXTRA_ALIGN.
-    """
-    DAMHF.update(DAMT)  # lang align
-    language = result["language"]
-    if language not in DAMHF and language not in EXTRA_ALIGN:
-        logger.warning(
-            "Automatic detection: Source language not compatible with align"
-        )
-        raise ValueError(
-            f"Detected language {result['language']}  incompatible, "
-            "you can select the source language to avoid this error."
-        )
-    # An empty entry in EXTRA_ALIGN marks a language without a model.
-    if EXTRA_ALIGN.get(language) == "":
-        lang_name = INVERTED_LANGUAGES.get(language, language)
-        logger.warning(
-            "No compatible wav2vec2 model found "
-            f"for the language '{lang_name}', skipping alignment."
-        )
-        return result
-    # random_sleep()
-    return load_align_and_align_segments(result, audio, DAMHF)
-# Short names mapped to diarization model identifiers. "disable" maps to an
-# empty name, which fails the `model_name` truthiness test at the top of
-# diarize_speech.
-diarization_models = {
-    "pyannote_3.1": "pyannote/speaker-diarization-3.1",
-    "pyannote_2.1": "pyannote/speaker-diarization@2.1",
-    "disable": "",
-}
-def reencode_speakers(result):
-    """
-    Renumber speaker labels in order of first appearance.
-    Parameters:
-    - result (dict): Dictionary whose "segments" entries carry a
-        "speaker" label; it is modified in place.
-    Returns:
-    - result (dict): The same dictionary. It is returned untouched when
-        there are no segments or the first segment is already
-        "SPEAKER_00".
-    """
-    segments = result["segments"]
-    # Guard the empty list: indexing the first segment would raise
-    # IndexError when nothing was transcribed.
-    if not segments or segments[0]["speaker"] == "SPEAKER_00":
-        return result
-    speaker_mapping = {}
-    logger.debug("Reencode speakers")
-    for segment in segments:
-        old_speaker = segment["speaker"]
-        if old_speaker not in speaker_mapping:
-            # The next free index equals the number of labels seen so far.
-            speaker_mapping[old_speaker] = f"SPEAKER_{len(speaker_mapping):02d}"
-        segment["speaker"] = speaker_mapping[old_speaker]
-    return result
def diarize_speech(
    audio_wav,
    result,
    min_speakers,
    max_speakers,
    YOUR_HF_TOKEN,
    model_name="pyannote/speaker-diarization@2.1",
):
    """
    Performs speaker diarization on speech segments.
    Parameters:
    - audio_wav (array): Audio data in WAV format to perform speaker
        diarization.
    - result (dict): Metadata containing information about speech segments
        and alignments.
    - min_speakers (int): Minimum number of speakers expected in the audio.
    - max_speakers (int): Maximum number of speakers expected in the audio.
    - YOUR_HF_TOKEN (str): Your Hugging Face API token for model
        authentication.
    - model_name (str): Name of the speaker diarization model to be used
        (default: "pyannote/speaker-diarization@2.1"). A falsy value
        disables diarization.
    Returns:
    - result_diarize (dict): Updated metadata after assigning speaker
        labels to segments.
    Raises:
    - ValueError: If one of the known pyannote pipelines fails to load with
        the "'NoneType' object has no attribute 'to'" error, which this
        code reports as a license that has not been accepted.
    - Exception: Any other pipeline loading error is re-raised unchanged.
    Notes:
    - This function utilizes a speaker diarization model to label speaker
        segments in the audio.
    - It assigns speakers to word-level segments based on diarization results.
    - Cleans up memory by releasing resources after diarization.
    - If only one speaker is specified, each segment is automatically assigned
        as the first speaker, eliminating the need for diarization inference.
    """
    if max(min_speakers, max_speakers) > 1 and model_name:
        try:
            diarize_model = whisperx.DiarizationPipeline(
                model_name=model_name,
                use_auth_token=YOUR_HF_TOKEN,
                device=os.environ.get("SONITR_DEVICE"),
            )
        except Exception as error:
            error_str = str(error)
            gc.collect()
            torch.cuda.empty_cache()  # noqa
            if "'NoneType' object has no attribute 'to'" not in error_str:
                raise
            if model_name == diarization_models["pyannote_2.1"]:
                raise ValueError(
                    "Accept the license agreement for using Pyannote 2.1."
                    " You need to have an account on Hugging Face and "
                    "accept the license to use the models: "
                    "https://huggingface.co/pyannote/speaker-diarization "
                    "and https://huggingface.co/pyannote/segmentation "
                    "Get your KEY TOKEN here: "
                    "https://hf.co/settings/tokens "
                ) from error
            if model_name == diarization_models["pyannote_3.1"]:
                raise ValueError(
                    "New Licence Pyannote 3.1: You need to have an account"
                    " on Hugging Face and accept the license to use the "
                    "models: https://huggingface.co/pyannote/speaker-diarization-3.1 " # noqa
                    "and https://huggingface.co/pyannote/segmentation-3.0 "
                ) from error
            # Any other model name: re-raise instead of falling through,
            # otherwise diarize_model would be used while unbound below.
            raise
        random_sleep()
        diarize_segments = diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers)
        result_diarize = whisperx.assign_word_speakers(
            diarize_segments, result
        )
        for segment in result_diarize["segments"]:
            if "speaker" not in segment:
                # Segments the diarizer left unlabeled default to speaker 0.
                segment["speaker"] = "SPEAKER_00"
                logger.warning(
                    f"No speaker detected in {segment['start']}. First TTS "
                    f"will be used for the segment text: {segment['text']} "
                )
        del diarize_model
        gc.collect()
        torch.cuda.empty_cache()  # noqa
    else:
        # Single speaker or diarization disabled: label everything speaker 0.
        result_diarize = result
        result_diarize["segments"] = [
            {**item, "speaker": "SPEAKER_00"}
            for item in result_diarize["segments"]
        ]
    return reencode_speakers(result_diarize)

+from whisperx.alignment import (
+    DEFAULT_ALIGN_MODELS_TORCH as DAMT,
+    DEFAULT_ALIGN_MODELS_HF as DAMHF,
+)
+from whisperx.utils import TO_LANGUAGE_CODE
+import whisperx
+import torch
+import gc
+import os
+import soundfile as sf
+from IPython.utils import capture # noqa
+from .language_configuration import EXTRA_ALIGN, INVERTED_LANGUAGES
+from .logging_setup import logger
+from .postprocessor import sanitize_file_name
+from .utils import remove_directory_contents, run_command
# Model names that transcribe_speech passes straight to whisperx.load_model.
# Any other name is treated as a model to convert with ctranslate2 first.
# "OpenAI_API_Whisper" selects the hosted API path instead of a local model.
ASR_MODEL_OPTIONS = [
    "tiny",
    "base",
    "small",
    "medium",
    "large",
    "large-v1",
    "large-v2",
    "large-v3",
    "distil-large-v2",
    "Systran/faster-distil-whisper-large-v3",
    "tiny.en",
    "base.en",
    "small.en",
    "medium.en",
    "distil-small.en",
    "distil-medium.en",
    "OpenAI_API_Whisper",
]
# Compute-type choices offered for GPU runs.
# NOTE(review): not referenced in this module; presumably consumed by the
# UI layer - confirm.
COMPUTE_TYPE_GPU = [
    "default",
    "auto",
    "int8",
    "int8_float32",
    "int8_float16",
    "int8_bfloat16",
    "float16",
    "bfloat16",
    "float32"
]
# Compute-type choices offered for CPU runs (same caveat as above).
COMPUTE_TYPE_CPU = [
    "default",
    "auto",
    "int8",
    "int8_float32",
    "int16",
    "float32",
]
# Directory holding converted whisper models, one sub-folder per model.
WHISPER_MODELS_PATH = './WHISPER_MODELS'
def openai_api_whisper(
    input_audio_file,
    source_lang=None,
    chunk_duration=1800
):
    """
    Transcribe an audio file through OpenAI's hosted Whisper API.
    Parameters:
    - input_audio_file (str): Path to the audio file to transcribe.
    - source_lang (str | None): Language code sent to the API. When falsy,
        the language reported for the first chunk is used afterwards.
    - chunk_duration (int): Maximum length, in seconds, of each chunk
        uploaded to the API (default: 1800).
    Returns:
    - Tuple containing:
        - audio: The input file as loaded by whisperx.load_audio.
        - result: Dict with "segments" (text/start/end) and "language".
    Notes:
    - The audio is re-encoded to Ogg Vorbis with ffmpeg into
        ./whisper_api_audio_parts, which is emptied on every call.
    - Segment timestamps are shifted by the chunk index times
        chunk_duration so they refer to the original file.
    """
    from openai import OpenAI

    info = sf.info(input_audio_file)
    duration = info.duration
    output_directory = "./whisper_api_audio_parts"
    os.makedirs(output_directory, exist_ok=True)
    remove_directory_contents(output_directory)
    if duration > chunk_duration:
        # Split the audio file into chunks of chunk_duration seconds
        cm = f'ffmpeg -i "{input_audio_file}" -f segment -segment_time {chunk_duration} -c:a libvorbis "{output_directory}/output%03d.ogg"'
        run_command(cm)
        # Zero-padded names make lexical order match chronological order
        chunk_files = sorted(
            f"{output_directory}/{f}"
            for f in os.listdir(output_directory)
            if f.endswith('.ogg')
        )
    else:
        one_file = f"{output_directory}/output000.ogg"
        cm = f'ffmpeg -i "{input_audio_file}" -c:a libvorbis {one_file}'
        run_command(cm)
        chunk_files = [one_file]
    # Transcript
    segments = []
    language = source_lang or None
    # One client serves every chunk; no need to rebuild it per request.
    client = OpenAI()
    for i, chunk in enumerate(chunk_files):
        # The context manager closes the chunk even if the request fails.
        with open(chunk, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                language=language,
                response_format="verbose_json",
                timestamp_granularities=["segment"],
            )
        try:
            transcript_dict = transcription.model_dump()
        except Exception:
            # Fall back for response objects without a working model_dump;
            # unlike a bare except this lets KeyboardInterrupt through.
            transcript_dict = transcription.to_dict()
        if language is None:
            detected = transcript_dict["language"]
            logger.info(f'Language detected: {detected}')
            # Keep the raw value when the name has no known code rather
            # than failing with a KeyError.
            language = TO_LANGUAGE_CODE.get(detected, detected)
        chunk_time = chunk_duration * i
        for seg in transcript_dict["segments"]:
            if "start" in seg:
                segments.append(
                    {
                        "text": seg["text"],
                        "start": seg["start"] + chunk_time,
                        "end": seg["end"] + chunk_time,
                    }
                )
    audio = whisperx.load_audio(input_audio_file)
    result = {"segments": segments, "language": language}
    return audio, result
def find_whisper_models(path=None):
    """
    List the converted whisper models available on disk.
    Parameters:
    - path (str | None): Directory to scan. Defaults to WHISPER_MODELS_PATH
        when omitted.
    Returns:
    - folders (list): Sorted names of the sub-folders of `path` that
        contain a 'model.bin' file; empty when `path` is not a directory.
    """
    if path is None:
        path = WHISPER_MODELS_PATH
    if not os.path.isdir(path):
        return []
    # Sorting makes the result independent of the file system's listing order.
    return sorted(
        folder
        for folder in os.listdir(path)
        if os.path.isfile(os.path.join(path, folder, 'model.bin'))
    )
def transcribe_speech(
    audio_wav,
    asr_model,
    compute_type,
    batch_size,
    SOURCE_LANGUAGE,
    literalize_numbers=True,
    segment_duration_limit=15,
):
    """
    Transcribe speech using a whisper model.
    Parameters:
    - audio_wav (str): Path to the audio file in WAV format.
    - asr_model (str): The whisper model to be loaded. A name outside
        ASR_MODEL_OPTIONS is converted with ctranslate2 into
        WHISPER_MODELS_PATH on first use and loaded from that folder.
    - compute_type (str): Type of compute to be used (e.g., 'int8', 'float16').
    - batch_size (int): Batch size for transcription.
    - SOURCE_LANGUAGE (str): Source language for transcription.
    - literalize_numbers (bool): Passed to whisperx as the
        "suppress_numerals" ASR option (default: True).
    - segment_duration_limit (int): Passed to the transcription call as
        chunk_size (default: 15).
    Returns:
    - Tuple containing:
        - audio: Loaded audio file.
        - result: Transcription result as a dictionary.
    Notes:
    - "OpenAI_API_Whisper" delegates to openai_api_whisper.
    - The model is released and the CUDA cache emptied even when
        transcription fails.
    """
    if asr_model == "OpenAI_API_Whisper":
        if literalize_numbers:
            logger.info(
                "OpenAI's API Whisper does not support "
                "the literalization of numbers."
            )
        return openai_api_whisper(audio_wav, SOURCE_LANGUAGE)
    # https://github.com/openai/whisper/discussions/277
    # The prompt is decided before "zh-TW" is folded into "zh", so only an
    # explicit "zh" request gets the Mandarin prompt.
    prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None
    SOURCE_LANGUAGE = (
        SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
    )
    asr_options = {
        "initial_prompt": prompt,
        "suppress_numerals": literalize_numbers
    }
    if asr_model not in ASR_MODEL_OPTIONS:
        base_dir = WHISPER_MODELS_PATH
        os.makedirs(base_dir, exist_ok=True)
        model_dir = os.path.join(base_dir, sanitize_file_name(asr_model))
        if not os.path.exists(model_dir):
            from ctranslate2.converters import TransformersConverter
            quantization = "float32"
            # Download new model. The converter is built outside the try so
            # the handler below can never reference an unbound name.
            converter = TransformersConverter(
                asr_model,
                low_cpu_mem_usage=True,
                copy_files=[
                    "tokenizer_config.json", "preprocessor_config.json"
                ]
            )
            try:
                converter.convert(
                    model_dir,
                    quantization=quantization,
                    force=False
                )
            except Exception as error:
                if "File tokenizer_config.json does not exist" not in str(error):
                    raise
                # Retry with tokenizer.json for repositories that lack
                # tokenizer_config.json (relies on a private attribute).
                converter._copy_files = [
                    "tokenizer.json", "preprocessor_config.json"
                ]
                converter.convert(
                    model_dir,
                    quantization=quantization,
                    force=True
                )
        asr_model = model_dir
        logger.info(f"ASR Model: {str(model_dir)}")
    model = whisperx.load_model(
        asr_model,
        os.environ.get("SONITR_DEVICE"),
        compute_type=compute_type,
        language=SOURCE_LANGUAGE,
        asr_options=asr_options,
    )
    try:
        audio = whisperx.load_audio(audio_wav)
        result = model.transcribe(
            audio,
            batch_size=batch_size,
            chunk_size=segment_duration_limit,
            print_progress=True,
        )
    finally:
        # Free the model whether or not transcription succeeded.
        del model
        gc.collect()
        torch.cuda.empty_cache()  # noqa
    if result["language"] == "zh" and not prompt:
        result["language"] = "zh-TW"
        logger.info("Chinese - Traditional (zh-TW)")
    return audio, result
def align_speech(audio, result):
    """
    Align transcribed segments against the audio.
    Parameters:
    - audio (array): The audio data in a suitable format for alignment.
    - result (dict): Transcription metadata with "segments" and "language".
    Returns:
    - result (dict): The aligned result (with character-level alignments),
        or the input unchanged when the language has an empty EXTRA_ALIGN
        entry and alignment is skipped.
    Raises:
    - ValueError: If the language is in neither the default whisperx
        alignment tables nor EXTRA_ALIGN.
    Notes:
    - The torch alignment table is merged into the Hugging Face table
        before the language lookup.
    - The alignment model is released and the CUDA cache emptied afterwards.
    """
    DAMHF.update(DAMT)  # lang align
    lang = result["language"]
    in_default_models = lang in DAMHF
    in_extra_models = lang in EXTRA_ALIGN

    if not in_default_models and not in_extra_models:
        logger.warning(
            "Automatic detection: Source language not compatible with align"
        )
        raise ValueError(
            f"Detected language {lang}  incompatible, "
            "you can select the source language to avoid this error."
        )

    if in_extra_models and EXTRA_ALIGN[lang] == "":
        # Known language without a usable alignment model: pass through.
        lang_name = (
            INVERTED_LANGUAGES[lang] if lang in INVERTED_LANGUAGES else lang
        )
        logger.warning(
            "No compatible wav2vec2 model found "
            f"for the language '{lang_name}', skipping alignment."
        )
        return result

    # None lets whisperx pick its default model for the language.
    align_model_name = None if in_default_models else EXTRA_ALIGN[lang]
    model_a, metadata = whisperx.load_align_model(
        language_code=lang,
        device=os.environ.get("SONITR_DEVICE"),
        model_name=align_model_name,
    )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        os.environ.get("SONITR_DEVICE"),
        return_char_alignments=True,
        print_progress=False,
    )
    del model_a
    gc.collect()
    torch.cuda.empty_cache()  # noqa
    return result
# Maps a diarization option name to the pyannote pipeline identifier handed
# to whisperx.DiarizationPipeline. "disable" maps to an empty string, which
# diarize_speech treats as "skip diarization" (falsy model_name).
diarization_models = {
    "pyannote_3.1": "pyannote/speaker-diarization-3.1",
    "pyannote_2.1": "pyannote/speaker-diarization@2.1",
    "disable": "",
}
def reencode_speakers(result):
    """
    Renumber speaker labels in order of first appearance.
    Parameters:
    - result (dict): Metadata with a "segments" list whose items each carry
        a "speaker" key.
    Returns:
    - result (dict): The same dict; segment speakers are rewritten in place
        to "SPEAKER_00", "SPEAKER_01", ... following first appearance.
    Notes:
    - When there are no segments, or the first segment is already
        "SPEAKER_00", the input is returned untouched.
    """
    segments = result["segments"]
    # Guard the empty case: indexing segments[0] would raise IndexError.
    if not segments or segments[0]["speaker"] == "SPEAKER_00":
        return result
    speaker_mapping = {}
    logger.debug("Reencode speakers")
    for segment in segments:
        old_speaker = segment["speaker"]
        if old_speaker not in speaker_mapping:
            # The next free index equals the number of speakers seen so far.
            speaker_mapping[old_speaker] = (
                f"SPEAKER_{len(speaker_mapping):02d}"
            )
        segment["speaker"] = speaker_mapping[old_speaker]
    return result
def diarize_speech(
    audio_wav,
    result,
    min_speakers,
    max_speakers,
    YOUR_HF_TOKEN,
    model_name="pyannote/speaker-diarization@2.1",
):
    """
    Performs speaker diarization on speech segments.
    Parameters:
    - audio_wav (array): Audio data in WAV format to perform speaker
        diarization.
    - result (dict): Metadata containing information about speech segments
        and alignments.
    - min_speakers (int): Minimum number of speakers expected in the audio.
    - max_speakers (int): Maximum number of speakers expected in the audio.
    - YOUR_HF_TOKEN (str): Your Hugging Face API token for model
        authentication.
    - model_name (str): Name of the speaker diarization model to be used
        (default: "pyannote/speaker-diarization@2.1"). A falsy value
        disables diarization.
    Returns:
    - result_diarize (dict): Updated metadata after assigning speaker
        labels to segments.
    Raises:
    - ValueError: If one of the known pyannote pipelines fails to load with
        the "'NoneType' object has no attribute 'to'" error, which this
        code reports as a license that has not been accepted.
    - Exception: Any other pipeline loading error is re-raised unchanged.
    Notes:
    - This function utilizes a speaker diarization model to label speaker
        segments in the audio.
    - It assigns speakers to word-level segments based on diarization results.
    - Cleans up memory by releasing resources after diarization.
    - If only one speaker is specified, each segment is automatically assigned
        as the first speaker, eliminating the need for diarization inference.
    """
    if max(min_speakers, max_speakers) > 1 and model_name:
        try:
            diarize_model = whisperx.DiarizationPipeline(
                model_name=model_name,
                use_auth_token=YOUR_HF_TOKEN,
                device=os.environ.get("SONITR_DEVICE"),
            )
        except Exception as error:
            error_str = str(error)
            gc.collect()
            torch.cuda.empty_cache()  # noqa
            if "'NoneType' object has no attribute 'to'" not in error_str:
                raise
            if model_name == diarization_models["pyannote_2.1"]:
                raise ValueError(
                    "Accept the license agreement for using Pyannote 2.1."
                    " You need to have an account on Hugging Face and "
                    "accept the license to use the models: "
                    "https://huggingface.co/pyannote/speaker-diarization "
                    "and https://huggingface.co/pyannote/segmentation "
                    "Get your KEY TOKEN here: "
                    "https://hf.co/settings/tokens "
                ) from error
            if model_name == diarization_models["pyannote_3.1"]:
                raise ValueError(
                    "New Licence Pyannote 3.1: You need to have an account"
                    " on Hugging Face and accept the license to use the "
                    "models: https://huggingface.co/pyannote/speaker-diarization-3.1 " # noqa
                    "and https://huggingface.co/pyannote/segmentation-3.0 "
                ) from error
            # Any other model name: re-raise instead of falling through,
            # otherwise diarize_model would be used while unbound below.
            raise
        diarize_segments = diarize_model(
            audio_wav, min_speakers=min_speakers, max_speakers=max_speakers
        )
        result_diarize = whisperx.assign_word_speakers(
            diarize_segments, result
        )
        for segment in result_diarize["segments"]:
            if "speaker" not in segment:
                # Segments the diarizer left unlabeled default to speaker 0.
                segment["speaker"] = "SPEAKER_00"
                logger.warning(
                    f"No speaker detected in {segment['start']}. First TTS "
                    f"will be used for the segment text: {segment['text']} "
                )
        del diarize_model
        gc.collect()
        torch.cuda.empty_cache()  # noqa
    else:
        # Single speaker or diarization disabled: label everything speaker 0.
        result_diarize = result
        result_diarize["segments"] = [
            {**item, "speaker": "SPEAKER_00"}
            for item in result_diarize["segments"]
        ]
    return reencode_speakers(result_diarize)

soni_translate/text_multiformat_processor.py CHANGED Viewed

@@ -1,987 +1,987 @@
-from .logging_setup import logger
-from whisperx.utils import get_writer
-from .utils import remove_files, run_command, remove_directory_contents
-from typing import List
-import srt
-import re
-import os
-import copy
-import string
-import soundfile as sf
-from PIL import Image, ImageOps, ImageDraw, ImageFont
# One list entry per punctuation character: ASCII punctuation plus a set of
# additional quotation marks and brackets.
punctuation_list = list(
    string.punctuation + "¡¿«»„”“”‚‘’「」『』《》（）【】〈〉〔〕〖〗〘〙〚〛⸤⸥⸨⸩"
)
# Texts that clean_text treats as carrying no content: a lone punctuation
# character, the empty string, or a short run of dots.
symbol_list = punctuation_list + ["", "..", "..."]
def extract_from_srt(file_path):
    """Read a UTF-8 SRT file and return its parsed subtitles as a list."""
    with open(file_path, "r", encoding="utf-8") as srt_file:
        raw_subtitles = srt_file.read()
    return list(srt.parse(raw_subtitles))
def clean_text(text):
    """
    Strip markup and music notation from one subtitle text.
    Parameters:
    - text (str): Raw subtitle content.
    Returns:
    - str: The cleaned text, or "" when a music symbol remains or when
        nothing but an entry of symbol_list is left.
    """
    # Removed in this order: [bracketed] notes, <comment> blocks, remaining
    # HTML tags, then anything enclosed between paired music symbols.
    removable_patterns = (
        r'\[.*?\]',
        r'<comment>.*?</comment>',
        r'<.*?>',
        r'♫.*?♫',
        r'♪.*?♪',
    )
    for pattern in removable_patterns:
        text = re.sub(pattern, '', text)
    # Line breaks become sentence breaks; double quotes are dropped.
    text = text.replace("\n", ". ").replace('"', '')
    # Collapse whitespace runs, then normalize spaces around periods.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[\s\.]+(?=\s)", ". ", text)
    # An unpaired music symbol discards the whole text.
    if '♫' in text or '♪' in text:
        return ""
    cleaned = text.strip()
    if cleaned in symbol_list:
        return ""
    return cleaned
def srt_file_to_segments(file_path, speaker=False):
    """
    Convert an SRT subtitle file into a segments dictionary.
    Parameters:
    - file_path (str): Path to the .srt file.
    - speaker (bool): When True, every segment is tagged "SPEAKER_00".
    Returns:
    - dict: {"segments": [...]} with "text", "start" and "end" (seconds)
        for each subtitle that still has text after clean_text.
    Raises:
    - Exception: If no subtitle yields any text.
    Notes:
    - If parsing fails, the file is rewritten through ffmpeg as
        "fixed_sub.srt" and parsed again.
    """
    try:
        subtitles = extract_from_srt(file_path)
    except Exception as error:
        logger.error(str(error))
        fixed_file = "fixed_sub.srt"
        remove_files(fixed_file)
        run_command(f'ffmpeg -i "{file_path}" "{fixed_file}" -y')
        subtitles = extract_from_srt(fixed_file)

    segments = []
    for subtitle in subtitles:
        text = clean_text(str(subtitle.content))
        if not text:
            continue
        segments.append(
            {
                "text": text,
                "start": float(subtitle.start.total_seconds()),
                "end": float(subtitle.end.total_seconds()),
            }
        )

    if not segments:
        raise Exception("No data found in srt subtitle file")

    if speaker:
        for entry in segments:
            entry["speaker"] = "SPEAKER_00"

    return {"segments": segments}
-# documents
def dehyphenate(lines: List[str], line_no: int) -> List[str]:
    """
    Join the word split across lines[line_no] and the following line.

    The last character of lines[line_no] is dropped and the first
    space-delimited word of the next line is appended to it; that word is
    then removed from the next line. The list is modified in place and
    returned.
    """
    following_idx = line_no + 1
    word_suffix = lines[following_idx].split(" ")[0]
    lines[line_no] = lines[line_no][:-1] + word_suffix
    lines[following_idx] = lines[following_idx][len(word_suffix):]
    return lines
def remove_hyphens(text: str) -> str:
    """
    Re-join words that were hyphenated across line breaks.

    Every line is right-stripped; each line (except the last) ending in "-"
    loses that character and receives the first word of the next line.

    This fails for:
    * Natural dashes: well-known, self-replication, use-cases, non-semantic,
                      Post-processing, Window-wise, viewpoint-dependent
    * Trailing math operands: 2 - 4
    * Names: Lopez-Ferreras, VGG-19, CIFAR-100
    """
    lines = [line.rstrip() for line in text.split("\n")]
    # Collect the hyphenated lines first; the joins below edit the list.
    hyphenated = [
        idx for idx, line in enumerate(lines[:-1]) if line.endswith("-")
    ]
    for idx in hyphenated:
        word_suffix = lines[idx + 1].split(" ")[0]
        lines[idx] = lines[idx][:-1] + word_suffix
        lines[idx + 1] = lines[idx + 1][len(word_suffix):]
    return "\n".join(lines)
def pdf_to_txt(pdf_file, start_page, end_page):
    """
    Extract the text of a page range from a PDF.
    Parameters:
    - pdf_file (str): Path to the PDF.
    - start_page (int): First page, 1-based; values below 1 clamp to page 1.
    - end_page (int): Last page, inclusive; clamped to the page count.
    Returns:
    - str: Concatenated page texts, each passed through remove_hyphens.
    """
    from pypdf import PdfReader

    with open(pdf_file, "rb") as file:
        reader = PdfReader(file)
        total_pages = reader.get_num_pages()
        logger.debug(f"Total pages: {total_pages}")
        start_page_idx = max((start_page-1), 0)
        end_page_inx = min(end_page, total_pages)
        document_pages = reader.pages[start_page_idx:end_page_inx]
        logger.info(
            f"Selected pages from {start_page_idx} to {end_page_inx}: "
            f"{len(document_pages)}"
        )
        # Extraction stays inside the with-block while the file is open.
        text = "".join(
            remove_hyphens(page.extract_text()) for page in document_pages
        )
    return text
def docx_to_txt(docx_file):
    """Return the text of a .docx file, one paragraph per line."""
    # https://github.com/AlJohri/docx2pdf update
    from docx import Document

    paragraphs = Document(docx_file).paragraphs
    return "".join(paragraph.text + "\n" for paragraph in paragraphs)
def replace_multiple_elements(text, replacements):
    """
    Apply several literal replacements in one pass and collapse whitespace.
    Parameters:
    - text (str): Input text.
    - replacements (dict): Maps each literal substring to its replacement.
        May be empty, in which case only the whitespace is normalized.
    Returns:
    - str: Text with the replacements applied and every whitespace run
        reduced to a single space.
    """
    replaced_text = text
    # An empty mapping would compile to an empty pattern that matches at
    # every position and then fail the lookup with KeyError('').
    if replacements:
        pattern = re.compile("|".join(map(re.escape, replacements.keys())))
        replaced_text = pattern.sub(
            lambda match: replacements[match.group(0)], text
        )
    # Remove multiple spaces
    replaced_text = re.sub(r"\s+", " ", replaced_text)
    return replaced_text
def document_preprocessor(file_path, is_string, start_page, end_page):
    """
    Load a document as plain text and save it to ./text_preprocessor.txt.
    Parameters:
    - file_path (str): Path to a .pdf, .docx or .txt file, or the text
        itself when is_string is true.
    - is_string (bool): Treat file_path as the text to process.
    - start_page (int): First page to read (PDF only).
    - end_page (int): Last page to read (PDF only).
    Returns:
    - Tuple (txt_file_path, text): Path of the written file and its content.
    Raises:
    - Exception: If the file extension is not supported.
    """
    if is_string:
        text = file_path
    else:
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext == ".pdf":
            text = pdf_to_txt(file_path, start_page, end_page)
        elif file_ext == ".docx":
            text = docx_to_txt(file_path)
        elif file_ext == ".txt":
            with open(
                file_path, "r", encoding='utf-8', errors='replace'
            ) as file:
                text = file.read()
        else:
            raise Exception("Unsupported file format")

    # Add space to break segments more easily later
    text = replace_multiple_elements(
        text,
        {
            "、": "、 ",
            "。": "。 ",
        },
    )

    # Save text to a .txt file
    txt_file_path = "./text_preprocessor.txt"
    with open(
        txt_file_path, "w", encoding='utf-8', errors='replace'
    ) as txt_file:
        txt_file.write(text)
    return txt_file_path, text
def split_text_into_chunks(text, chunk_size):
    """
    Split text into chunks of whole words no longer than chunk_size.
    Parameters:
    - text (str): Text to split. Only word characters are kept; punctuation
        is dropped by the word extraction.
    - chunk_size (int): Maximum chunk length in characters, counting the
        single spaces between words.
    Returns:
    - list: Non-empty chunks in order. A single word longer than chunk_size
        becomes a chunk of its own.
    """
    words = re.findall(r"\b\w+\b", text)
    chunks = []
    current_chunk = ""
    for word in words:
        if (
            len(current_chunk) + len(word) + 1 <= chunk_size
        ):  # Adding 1 for the space between words
            if current_chunk:
                current_chunk += " "
            current_chunk += word
        else:
            # Skip the flush while nothing is accumulated, so an oversized
            # first word does not produce an empty leading chunk.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def determine_chunk_size(file_name):
    """
    Pick the text chunk size for a TTS voice from the shape of its name.
    Parameters:
    - file_name (str): Voice name or audio file name.
    Returns:
    - int: Chunk size of the first matching rule, or 100 when none matches.
    """
    # Rules are tried in order; the first match wins.
    rules = (
        (r".*-(Male|Female)$", 1024),  # by character
        (r".* BARK$", 100),  # t 64 256
        (r".* VITS$", 500),
        (r".+\.(wav|mp3|ogg|m4a)$", 150),  # t 250 400 api automatic split
        (r".* VITS-onnx$", 250),  # automatic sentence split
        (r".* OpenAI-TTS$", 1024),  # max charaters 4096
    )
    for regex, chunk_size in rules:
        if re.match(regex, file_name):
            return chunk_size
    # Default chunk size if the file doesn't match any pattern; max 1800
    return 100
def plain_text_to_segments(result_text=None, chunk_size=None):
    """
    Turn plain text into one-second placeholder segments.
    Parameters:
    - result_text (str): Text to split into chunks.
    - chunk_size (int): Maximum chunk length; a falsy value means 100.
    Returns:
    - dict: {"segments": [...]} where chunk number n (from 0) spans
        1.0 + n to 2.0 + n and is assigned to "SPEAKER_00".
    """
    effective_size = chunk_size or 100
    segments_chunks = [
        {
            "text": chunk,
            "start": (1.0 + num),
            "end": (2.0 + num),
            "speaker": "SPEAKER_00",
        }
        for num, chunk in enumerate(
            split_text_into_chunks(result_text, effective_size)
        )
    ]
    return {"segments": segments_chunks}
def segments_to_plain_text(result_diarize):
    """
    Join all segment texts and save them to ./text_translation.txt.
    Parameters:
    - result_diarize (dict): Metadata with a "segments" list of dicts that
        each have a "text" key.
    Returns:
    - Tuple (txt_file_path, complete_text): Path of the written file and
        the joined text; every segment text is followed by one space.
    """
    complete_text = "".join(
        seg["text"] + " " for seg in result_diarize["segments"]
    )  # issue

    # Save text to a .txt file
    txt_file_path = "./text_translation.txt"
    with open(
        txt_file_path, "w", encoding='utf-8', errors='replace'
    ) as txt_file:
        txt_file.write(complete_text)
    return txt_file_path, complete_text
-# doc to video
# Named colors as (R, G, B) tuples, used for image borders and backgrounds.
COLORS = {
    "black": (0, 0, 0),
    "white": (255, 255, 255),
    "red": (255, 0, 0),
    "green": (0, 255, 0),
    "blue": (0, 0, 255),
    "yellow": (255, 255, 0),
    "light_gray": (200, 200, 200),
    "light_blue": (173, 216, 230),
    "light_green": (144, 238, 144),
    "light_yellow": (255, 255, 224),
    "light_pink": (255, 182, 193),
    "lavender": (230, 230, 250),
    "peach": (255, 218, 185),
    "light_cyan": (224, 255, 255),
    "light_salmon": (255, 160, 122),
    "light_green_yellow": (173, 255, 47),
}
# Border choices: "dynamic" (border taken from the image's average color in
# add_border_to_image) followed by every named color.
BORDER_COLORS = ["dynamic"] + list(COLORS.keys())
def calculate_average_color(img):
    """
    Approximate the average color of an image.

    The image is shrunk to 50x50, converted to RGB and shrunk again to a
    single pixel, whose value is returned.
    """
    # Resize the image to a small size for faster processing
    thumbnail = img.resize((50, 50))
    single_pixel = thumbnail.convert("RGB").resize((1, 1))
    return single_pixel.getpixel((0, 0))
def add_border_to_image(
    image_path,
    target_width,
    target_height,
    border_color=None
):
    """
    Fit an image into a target resolution by resizing and adding borders.
    Parameters:
    - image_path (str): Image to process; it is overwritten with the result.
    - target_width (int): Width of the output image.
    - target_height (int): Height of the output image.
    - border_color (str | None): Key of COLORS, or None/"dynamic" to use the
        image's average color. Unknown names fall back to black.
    Returns:
    - image_path (str): The same path that was passed in.
    Notes:
    - The aspect ratio is kept. When the leftover space is odd, the extra
        pixel goes to the right/bottom border so the output matches the
        target size exactly.
    """
    img = Image.open(image_path)
    # Calculate the width and height for the new image with borders
    original_width, original_height = img.size
    original_aspect_ratio = original_width / original_height
    target_aspect_ratio = target_width / target_height
    # Resize the image to fit the target resolution retaining aspect ratio
    if original_aspect_ratio > target_aspect_ratio:
        # Image is wider, calculate new height
        new_height = int(target_width / original_aspect_ratio)
        resized_img = img.resize((target_width, new_height))
    else:
        # Image is taller, calculate new width
        new_width = int(target_height * original_aspect_ratio)
        resized_img = img.resize((new_width, target_height))
    # Calculate padding for borders. Only one of the two gaps is non-zero;
    # splitting it as (half, remainder) keeps the total exact for odd gaps.
    gap_width = target_width - resized_img.size[0]
    gap_height = target_height - resized_img.size[1]
    left = gap_width // 2
    top = gap_height // 2
    padding = (left, top, gap_width - left, gap_height - top)
    # Add borders with specified color
    if not border_color or border_color == "dynamic":
        border_color = calculate_average_color(resized_img)
    else:
        border_color = COLORS.get(border_color, (0, 0, 0))
    bordered_img = ImageOps.expand(resized_img, padding, fill=border_color)
    bordered_img.save(image_path)
    return image_path
def resize_and_position_subimage(
    subimage,
    max_width,
    max_height,
    subimage_position,
    main_width,
    main_height
):
    """
    Scale a sub-image down to fit a size limit and place it in a corner.
    Parameters:
    - subimage: Image to place; only shrunk, never enlarged.
    - max_width, max_height: Size limit for the sub-image.
    - subimage_position (str): 'top-left', 'top-right', 'bottom-left' or
        'bottom-right'.
    - main_width, main_height: Size of the image that receives it.
    Returns:
    - Tuple (subimage, subimage_x, subimage_y): The possibly resized image
        and its top-left coordinates inside the main image.
    Raises:
    - ValueError: If subimage_position is not one of the four corners.
    """
    subimage_width, subimage_height = subimage.size
    # Resize subimage if it exceeds maximum dimensions
    if subimage_width > max_width or subimage_height > max_height:
        # The smaller ratio keeps both dimensions inside the limit.
        scale = min(max_width / subimage_width, max_height / subimage_height)
        subimage = subimage.resize(
            (int(subimage_width * scale), int(subimage_height * scale))
        )

    # Offsets that push the sub-image against the right and bottom edges.
    right_x = main_width - subimage.width
    bottom_y = main_height - subimage.height
    corners = {
        "top-left": (0, 0),
        "top-right": (right_x, 0),
        "bottom-left": (0, bottom_y),
        "bottom-right": (right_x, bottom_y),
    }
    if subimage_position not in corners:
        raise ValueError(
            "Invalid subimage_position. Choose from 'top-left', 'top-right',"
            " 'bottom-left', or 'bottom-right'."
        )
    subimage_x, subimage_y = corners[subimage_position]
    return subimage, subimage_x, subimage_y
def create_image_with_text_and_subimages(
    text,
    subimages,
    width,
    height,
    text_color,
    background_color,
    output_file
):
    """
    Render a solid-color image with centered text and corner sub-images.
    Parameters:
    - text (str): Text drawn at the center with the default font.
    - subimages (list): (image_path, position) pairs; see
        resize_and_position_subimage for the accepted positions.
    - width, height (int): Size of the generated image.
    - text_color, background_color: Colors for the text and the canvas.
    - output_file (str): Where the image is saved.
    Returns:
    - output_file (str): The path that was passed in.
    """
    canvas = Image.new('RGB', (width, height), color=background_color)
    pen = ImageDraw.Draw(canvas)
    font = ImageFont.load_default()  # You can specify your font file here

    # Center the text from its bounding box.
    box_left, box_top, box_right, box_bottom = pen.textbbox(
        (0, 0), text, font=font
    )
    text_x = (width - (box_right - box_left)) / 2
    text_y = (height - (box_bottom - box_top)) / 2
    pen.text((text_x, text_y), text, fill=text_color, font=font)

    for subimage_path, subimage_position in subimages:
        overlay = Image.open(subimage_path)
        # RGBA is needed because the overlay doubles as its own paste mask.
        if overlay.mode != 'RGBA':
            overlay = overlay.convert('RGBA')
        # Each overlay is limited to a quarter of the canvas per dimension.
        overlay, overlay_x, overlay_y = resize_and_position_subimage(
            overlay, width / 4, height / 4, subimage_position, width, height
        )
        canvas.paste(overlay, (int(overlay_x), int(overlay_y)), overlay)

    canvas.save(output_file)
    return output_file
def doc_to_txtximg_pages(
    document,
    width,
    height,
    start_page,
    end_page,
    bcolor
):
    """
    Extract per-page text and images from a PDF into pdf_images/.
    Parameters:
    - document (str): Path to the PDF.
    - width, height (int): Resolution every saved image is fitted to.
    - start_page (int): First page, 1-based; values below 1 clamp to page 1.
    - end_page (int): Last page, inclusive; clamped to the page count.
    - bcolor (str): Border color name passed to add_border_to_image; also
        selects the title image background (white when not in COLORS).
    Returns:
    - data_doc (dict): Maps the index of each selected page (from 0) to
        {"text": ..., "images": [...]}.
    Notes:
    - pdf_images/ is emptied first, and a title image is written to
        pdf_images/0000_00_aaa.png.
    """
    from pypdf import PdfReader

    images_folder = "pdf_images/"
    os.makedirs(images_folder, exist_ok=True)
    remove_directory_contents(images_folder)

    # First image: the file name without its last four characters.
    title_text = os.path.basename(document)[:-4]
    logo_overlays = [("./assets/logo.jpeg", "top-left")]
    if bcolor == "black":
        text_color = (255, 255, 255)
    else:
        text_color = (0, 0, 0)
    background_color = COLORS.get(bcolor, (255, 255, 255))  # dynamic white
    create_image_with_text_and_subimages(
        title_text,
        logo_overlays,
        width,
        height,
        text_color,
        background_color,
        "pdf_images/0000_00_aaa.png"
    )

    reader = PdfReader(document)
    total_pages = reader.get_num_pages()
    logger.debug(f"Total pages: {total_pages}")
    start_page_idx = max((start_page-1), 0)
    end_page_inx = min(end_page, total_pages)
    document_pages = reader.pages[start_page_idx:end_page_inx]
    logger.info(
        f"Selected pages from {start_page_idx} to {end_page_inx}: "
        f"{len(document_pages)}"
    )

    data_doc = {}
    for i, page in enumerate(document_pages):
        images = []
        for count, image_file_object in enumerate(page.images):
            img_name = f"{images_folder}{i:04d}_{count:02d}_{image_file_object.name}"
            images.append(img_name)
            with open(img_name, "wb") as fp:
                fp.write(image_file_object.data)
            # Rewrites the saved file in place at the target resolution.
            add_border_to_image(img_name, width, height, bcolor)
        data_doc[i] = {
            "text": remove_hyphens(page.extract_text()),
            "images": images
        }
    return data_doc
-def page_data_to_segments(result_text=None, chunk_size=None):
-    if not chunk_size:
-        chunk_size = 100
-    segments_chunks = []
-    time_global = 0
-    for page, result_data in result_text.items():
-        # result_image = result_data["images"]
-        result_text = result_data["text"]
-        text_chunks = split_text_into_chunks(result_text, chunk_size)
-        if not text_chunks:
-            text_chunks = [" "]
-        for chunk in text_chunks:
-            chunk_dict = {
-                "text": chunk,
-                "start": (1.0 + time_global),
-                "end": (2.0 + time_global),
-                "speaker": "SPEAKER_00",
-                "page": page,
-            }
-            segments_chunks.append(chunk_dict)
-            time_global += 1
-    result_diarize = {"segments": segments_chunks}
-    return result_diarize
-def update_page_data(result_diarize, doc_data):
-    complete_text = ""
-    current_page = result_diarize["segments"][0]["page"]
-    text_page = ""
-    for seg in result_diarize["segments"]:
-        text = seg["text"] + " "  # issue
-        complete_text += text
-        page = seg["page"]
-        if page == current_page:
-            text_page += text
-        else:
-            doc_data[current_page]["text"] = text_page
-            # Next
-            text_page = text
-            current_page = page
-    if doc_data[current_page]["text"] != text_page:
-        doc_data[current_page]["text"] = text_page
-    return doc_data
-def fix_timestamps_docs(result_diarize, audio_files):
-    current_start = 0.0
-    for seg, audio in zip(result_diarize["segments"], audio_files):
-        duration = round(sf.info(audio).duration, 2)
-        seg["start"] = current_start
-        current_start += duration
-        seg["end"] = current_start
-    return result_diarize
-def create_video_from_images(
-    doc_data,
-    result_diarize
-):
-    # First image path
-    first_image = "pdf_images/0000_00_aaa.png"
-    # Time segments and images
-    max_pages_idx = len(doc_data) - 1
-    current_page = result_diarize["segments"][0]["page"]
-    duration_page = 0.0
-    last_image = None
-    for seg in result_diarize["segments"]:
-        start = seg["start"]
-        end = seg["end"]
-        duration_seg = end - start
-        page = seg["page"]
-        if page == current_page:
-            duration_page += duration_seg
-        else:
-            images = doc_data[current_page]["images"]
-            if first_image:
-                images = [first_image] + images
-                first_image = None
-            if not doc_data[min(max_pages_idx, (current_page+1))]["text"].strip():
-                images = images + doc_data[min(max_pages_idx, (current_page+1))]["images"]
-            if not images and last_image:
-                images = [last_image]
-            # Calculate images duration
-            time_duration_per_image = round((duration_page / len(images)), 2)
-            doc_data[current_page]["time_per_image"] = time_duration_per_image
-            # Next values
-            doc_data[current_page]["images"] = images
-            last_image = images[-1]
-            duration_page = duration_seg
-            current_page = page
-    if "time_per_image" not in doc_data[current_page].keys():
-        images = doc_data[current_page]["images"]
-        if first_image:
-            images = [first_image] + images
-        if not images:
-            images = [last_image]
-        time_duration_per_image = round((duration_page / len(images)), 2)
-        doc_data[current_page]["time_per_image"] = time_duration_per_image
-    # Timestamped image video.
-    with open("list.txt", "w") as file:
-        for i, page in enumerate(doc_data.values()):
-            duration = page["time_per_image"]
-            for img in page["images"]:
-                if i == len(doc_data) - 1 and img == page["images"][-1]:  # Check if it's the last item
-                    file.write(f"file {img}\n")
-                    file.write(f"outpoint {duration}")
-                else:
-                    file.write(f"file {img}\n")
-                    file.write(f"outpoint {duration}\n")
-    out_video = "video_from_images.mp4"
-    remove_files(out_video)
-    cm = f"ffmpeg -y -f concat -i list.txt -c:v libx264 -preset veryfast -crf 18 -pix_fmt yuv420p {out_video}"
-    cm_alt = f"ffmpeg -f concat -i list.txt -c:v libx264 -r 30 -pix_fmt yuv420p -y {out_video}"
-    try:
-        run_command(cm)
-    except Exception as error:
-        logger.error(str(error))
-        remove_files(out_video)
-        run_command(cm_alt)
-    return out_video
-def merge_video_and_audio(video_doc, final_wav_file):
-    fixed_audio = "fixed_audio.mp3"
-    remove_files(fixed_audio)
-    cm = f"ffmpeg -i {final_wav_file} -c:a libmp3lame {fixed_audio}"
-    run_command(cm)
-    vid_out = "video_book.mp4"
-    remove_files(vid_out)
-    cm = f"ffmpeg -i {video_doc} -i {fixed_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {vid_out}"
-    run_command(cm)
-    return vid_out
-# subtitles
-def get_subtitle(
-    language,
-    segments_data,
-    extension,
-    filename=None,
-    highlight_words=False,
-):
-    if not filename:
-        filename = "task_subtitle"
-    is_ass_extension = False
-    if extension == "ass":
-        is_ass_extension = True
-        extension = "srt"
-    sub_file = filename + "." + extension
-    support_name = filename + ".mp3"
-    remove_files(sub_file)
-    writer = get_writer(extension, output_dir=".")
-    word_options = {
-        "highlight_words": highlight_words,
-        "max_line_count": None,
-        "max_line_width": None,
-    }
-    # Get data subs
-    subtitle_data = copy.deepcopy(segments_data)
-    subtitle_data["language"] = (
-        "ja" if language in ["ja", "zh", "zh-TW"] else language
-    )
-    # Clean
-    if not highlight_words:
-        subtitle_data.pop("word_segments", None)
-        for segment in subtitle_data["segments"]:
-            for key in ["speaker", "chars", "words"]:
-                segment.pop(key, None)
-    writer(
-        subtitle_data,
-        support_name,
-        word_options,
-    )
-    if is_ass_extension:
-        temp_name = filename + ".ass"
-        remove_files(temp_name)
-        convert_sub = f'ffmpeg -i "{sub_file}" "{temp_name}" -y'
-        run_command(convert_sub)
-        sub_file = temp_name
-    return sub_file
-def process_subtitles(
-    deep_copied_result,
-    align_language,
-    result_diarize,
-    output_format_subtitle,
-    TRANSLATE_AUDIO_TO,
-):
-    name_ori = "sub_ori."
-    name_tra = "sub_tra."
-    remove_files(
-        [name_ori + output_format_subtitle, name_tra + output_format_subtitle]
-    )
-    writer = get_writer(output_format_subtitle, output_dir=".")
-    word_options = {
-        "highlight_words": False,
-        "max_line_count": None,
-        "max_line_width": None,
-    }
-    # original lang
-    subs_copy_result = copy.deepcopy(deep_copied_result)
-    subs_copy_result["language"] = (
-        "zh" if align_language == "zh-TW" else align_language
-    )
-    for segment in subs_copy_result["segments"]:
-        segment.pop("speaker", None)
-    try:
-        writer(
-            subs_copy_result,
-            name_ori[:-1] + ".mp3",
-            word_options,
-        )
-    except Exception as error:
-        logger.error(str(error))
-        if str(error) == "list indices must be integers or slices, not str":
-            logger.error(
-                "Related to poor word segmentation"
-                " in segments after alignment."
-            )
-        subs_copy_result["segments"][0].pop("words")
-        writer(
-            subs_copy_result,
-            name_ori[:-1] + ".mp3",
-            word_options,
-        )
-    # translated lang
-    subs_tra_copy_result = copy.deepcopy(result_diarize)
-    subs_tra_copy_result["language"] = (
-        "ja" if TRANSLATE_AUDIO_TO in ["ja", "zh", "zh-TW"] else align_language
-    )
-    subs_tra_copy_result.pop("word_segments", None)
-    for segment in subs_tra_copy_result["segments"]:
-        for key in ["speaker", "chars", "words"]:
-            segment.pop(key, None)
-    writer(
-        subs_tra_copy_result,
-        name_tra[:-1] + ".mp3",
-        word_options,
-    )
-    return name_tra + output_format_subtitle
-def linguistic_level_segments(
-    result_base,
-    linguistic_unit="word",  # word or char
-):
-    linguistic_unit = linguistic_unit[:4]
-    linguistic_unit_key = linguistic_unit + "s"
-    result = copy.deepcopy(result_base)
-    if linguistic_unit_key not in result["segments"][0].keys():
-        raise ValueError("No alignment detected, can't process")
-    segments_by_unit = []
-    for segment in result["segments"]:
-        segment_units = segment[linguistic_unit_key]
-        # segment_speaker = segment.get("speaker", "SPEAKER_00")
-        for unit in segment_units:
-            text = unit[linguistic_unit]
-            if "start" in unit.keys():
-                segments_by_unit.append(
-                    {
-                        "start": unit["start"],
-                        "end": unit["end"],
-                        "text": text,
-                        # "speaker": segment_speaker,
-                    }
-                    )
-            elif not segments_by_unit:
-                pass
-            else:
-                segments_by_unit[-1]["text"] += text
-    return {"segments": segments_by_unit}
-def break_aling_segments(
-    result: dict,
-    break_characters: str = "",  # ":|,|.|"
-):
-    result_align = copy.deepcopy(result)
-    break_characters_list = break_characters.split("|")
-    break_characters_list = [i for i in break_characters_list if i != '']
-    if not break_characters_list:
-        logger.info("No valid break characters were specified.")
-        return result
-    logger.info(f"Redivide text segments by: {str(break_characters_list)}")
-    # create new with filters
-    normal = []
-    def process_chars(chars, letter_new_start, num, text):
-        start_key, end_key = "start", "end"
-        start_value = end_value = None
-        for char in chars:
-            if start_key in char:
-                start_value = char[start_key]
-                break
-        for char in reversed(chars):
-            if end_key in char:
-                end_value = char[end_key]
-                break
-        if not start_value or not end_value:
-            raise Exception(
-                f"Unable to obtain a valid timestamp for chars: {str(chars)}"
-            )
-        return {
-            "start": start_value,
-            "end": end_value,
-            "text": text,
-            "words": chars,
-        }
-    for i, segment in enumerate(result_align['segments']):
-        logger.debug(f"- Process segment: {i}, text: {segment['text']}")
-        # start = segment['start']
-        letter_new_start = 0
-        for num, char in enumerate(segment['chars']):
-            if char["char"] is None:
-                continue
-            # if "start" in char:
-            #     start = char["start"]
-            # if "end" in char:
-            #     end = char["end"]
-            # Break by character
-            if char['char'] in break_characters_list:
-                text = segment['text'][letter_new_start:num+1]
-                logger.debug(
-                    f"Break in: {char['char']}, position: {num}, text: {text}"
-                )
-                chars = segment['chars'][letter_new_start:num+1]
-                if not text:
-                    logger.debug("No text")
-                    continue
-                if num == 0 and not text.strip():
-                    logger.debug("blank space in start")
-                    continue
-                if len(text) == 1:
-                    logger.debug(f"Short char append, num: {num}")
-                    normal[-1]["text"] += text
-                    normal[-1]["words"].append(chars)
-                    continue
-                # logger.debug(chars)
-                normal_dict = process_chars(chars, letter_new_start, num, text)
-                letter_new_start = num+1
-                normal.append(normal_dict)
-            # If we reach the end of the segment, add the last part of chars.
-            if num == len(segment["chars"]) - 1:
-                text = segment['text'][letter_new_start:num+1]
-                # If remain text len is not default len text
-                if num not in [len(text)-1, len(text)] and text:
-                    logger.debug(f'Remaining text: {text}')
-                if not text:
-                    logger.debug("No remaining text.")
-                    continue
-                if len(text) == 1:
-                    logger.debug(f"Short char append, num: {num}")
-                    normal[-1]["text"] += text
-                    normal[-1]["words"].append(chars)
-                    continue
-                chars = segment['chars'][letter_new_start:num+1]
-                normal_dict = process_chars(chars, letter_new_start, num, text)
-                letter_new_start = num+1
-                normal.append(normal_dict)
-    # Rename char to word
-    for item in normal:
-        words_list = item['words']
-        for word_item in words_list:
-            if 'char' in word_item:
-                word_item['word'] = word_item.pop('char')
-    # Convert to dict default
-    break_segments = {"segments": normal}
-    msg_count = (
-        f"Segment count before: {len(result['segments'])}, "
-        f"after: {len(break_segments['segments'])}."
-    )
-    logger.info(msg_count)
-    return break_segments

+from .logging_setup import logger
+from whisperx.utils import get_writer
+from .utils import remove_files, run_command, remove_directory_contents
+from typing import List
+import srt
+import re
+import os
+import copy
+import string
+import soundfile as sf
+from PIL import Image, ImageOps, ImageDraw, ImageFont
+punctuation_list = list(
+    string.punctuation + "¡¿«»„”“”‚‘’「」『』《》（）【】〈〉〔〕〖〗〘〙〚〛⸤⸥⸨⸩"
+)
+symbol_list = punctuation_list + ["", "..", "..."]
+def extract_from_srt(file_path):
+    with open(file_path, "r", encoding="utf-8") as file:
+        srt_content = file.read()
+    subtitle_generator = srt.parse(srt_content)
+    srt_content_list = list(subtitle_generator)
+    return srt_content_list
+def clean_text(text):
+    # Remove content within square brackets
+    text = re.sub(r'\[.*?\]', '', text)
+    # Add pattern to remove content within <comment> tags
+    text = re.sub(r'<comment>.*?</comment>', '', text)
+    # Remove HTML tags
+    text = re.sub(r'<.*?>', '', text)
+    # Remove "♫" and "♪" content
+    text = re.sub(r'♫.*?♫', '', text)
+    text = re.sub(r'♪.*?♪', '', text)
+    # Replace newline characters with an empty string
+    text = text.replace("\n", ". ")
+    # Remove double quotation marks
+    text = text.replace('"', '')
+    # Collapse multiple spaces and replace with a single space
+    text = re.sub(r"\s+", " ", text)
+    # Normalize spaces around periods
+    text = re.sub(r"[\s\.]+(?=\s)", ". ", text)
+    # Check if there are ♫ or ♪ symbols present
+    if '♫' in text or '♪' in text:
+        return ""
+    text = text.strip()
+    # Valid text
+    return text if text not in symbol_list else ""
+def srt_file_to_segments(file_path, speaker=False):
+    try:
+        srt_content_list = extract_from_srt(file_path)
+    except Exception as error:
+        logger.error(str(error))
+        fixed_file = "fixed_sub.srt"
+        remove_files(fixed_file)
+        fix_sub = f'ffmpeg -i "{file_path}" "{fixed_file}" -y'
+        run_command(fix_sub)
+        srt_content_list = extract_from_srt(fixed_file)
+    segments = []
+    for segment in srt_content_list:
+        text = clean_text(str(segment.content))
+        if text:
+            segments.append(
+                {
+                    "text": text,
+                    "start": float(segment.start.total_seconds()),
+                    "end": float(segment.end.total_seconds()),
+                }
+            )
+    if not segments:
+        raise Exception("No data found in srt subtitle file")
+    if speaker:
+        segments = [{**seg, "speaker": "SPEAKER_00"} for seg in segments]
+    return {"segments": segments}
+# documents
+def dehyphenate(lines: List[str], line_no: int) -> List[str]:
+    next_line = lines[line_no + 1]
+    word_suffix = next_line.split(" ")[0]
+    lines[line_no] = lines[line_no][:-1] + word_suffix
+    lines[line_no + 1] = lines[line_no + 1][len(word_suffix):]
+    return lines
+def remove_hyphens(text: str) -> str:
+    """
+    This fails for:
+    * Natural dashes: well-known, self-replication, use-cases, non-semantic,
+                      Post-processing, Window-wise, viewpoint-dependent
+    * Trailing math operands: 2 - 4
+    * Names: Lopez-Ferreras, VGG-19, CIFAR-100
+    """
+    lines = [line.rstrip() for line in text.split("\n")]
+    # Find dashes
+    line_numbers = []
+    for line_no, line in enumerate(lines[:-1]):
+        if line.endswith("-"):
+            line_numbers.append(line_no)
+    # Replace
+    for line_no in line_numbers:
+        lines = dehyphenate(lines, line_no)
+    return "\n".join(lines)
+def pdf_to_txt(pdf_file, start_page, end_page):
+    from pypdf import PdfReader
+    with open(pdf_file, "rb") as file:
+        reader = PdfReader(file)
+        logger.debug(f"Total pages: {reader.get_num_pages()}")
+        text = ""
+        start_page_idx = max((start_page-1), 0)
+        end_page_inx = min((end_page), (reader.get_num_pages()))
+        document_pages = reader.pages[start_page_idx:end_page_inx]
+        logger.info(
+            f"Selected pages from {start_page_idx} to {end_page_inx}: "
+            f"{len(document_pages)}"
+        )
+        for page in document_pages:
+            text += remove_hyphens(page.extract_text())
+    return text
+def docx_to_txt(docx_file):
+    # https://github.com/AlJohri/docx2pdf update
+    from docx import Document
+    doc = Document(docx_file)
+    text = ""
+    for paragraph in doc.paragraphs:
+        text += paragraph.text + "\n"
+    return text
+def replace_multiple_elements(text, replacements):
+    pattern = re.compile("|".join(map(re.escape, replacements.keys())))
+    replaced_text = pattern.sub(
+        lambda match: replacements[match.group(0)], text
+    )
+    # Remove multiple spaces
+    replaced_text = re.sub(r"\s+", " ", replaced_text)
+    return replaced_text
+def document_preprocessor(file_path, is_string, start_page, end_page):
+    if not is_string:
+        file_ext = os.path.splitext(file_path)[1].lower()
+    if is_string:
+        text = file_path
+    elif file_ext == ".pdf":
+        text = pdf_to_txt(file_path, start_page, end_page)
+    elif file_ext == ".docx":
+        text = docx_to_txt(file_path)
+    elif file_ext == ".txt":
+        with open(
+            file_path, "r", encoding='utf-8', errors='replace'
+        ) as file:
+            text = file.read()
+    else:
+        raise Exception("Unsupported file format")
+    # Add space to break segments more easily later
+    replacements = {
+        "、": "、 ",
+        "。": "。 ",
+        # "\n": " ",
+    }
+    text = replace_multiple_elements(text, replacements)
+    # Save text to a .txt file
+    # file_name = os.path.splitext(os.path.basename(file_path))[0]
+    txt_file_path = "./text_preprocessor.txt"
+    with open(
+        txt_file_path, "w", encoding='utf-8', errors='replace'
+    ) as txt_file:
+        txt_file.write(text)
+    return txt_file_path, text
+def split_text_into_chunks(text, chunk_size):
+    words = re.findall(r"\b\w+\b", text)
+    chunks = []
+    current_chunk = ""
+    for word in words:
+        if (
+            len(current_chunk) + len(word) + 1 <= chunk_size
+        ):  # Adding 1 for the space between words
+            if current_chunk:
+                current_chunk += " "
+            current_chunk += word
+        else:
+            chunks.append(current_chunk)
+            current_chunk = word
+    if current_chunk:
+        chunks.append(current_chunk)
+    return chunks
+def determine_chunk_size(file_name):
+    patterns = {
+        re.compile(r".*-(Male|Female)$"): 1024,  # by character
+        re.compile(r".* BARK$"): 100,  # t 64 256
+        re.compile(r".* VITS$"): 500,
+        re.compile(
+            r".+\.(wav|mp3|ogg|m4a)$"
+        ): 150,  # t 250 400 api automatic split
+        re.compile(r".* VITS-onnx$"): 250,  # automatic sentence split
+        re.compile(r".* OpenAI-TTS$"): 1024  # max charaters 4096
+    }
+    for pattern, chunk_size in patterns.items():
+        if pattern.match(file_name):
+            return chunk_size
+    # Default chunk size if the file doesn't match any pattern; max 1800
+    return 100
+def plain_text_to_segments(result_text=None, chunk_size=None):
+    if not chunk_size:
+        chunk_size = 100
+    text_chunks = split_text_into_chunks(result_text, chunk_size)
+    segments_chunks = []
+    for num, chunk in enumerate(text_chunks):
+        chunk_dict = {
+            "text": chunk,
+            "start": (1.0 + num),
+            "end": (2.0 + num),
+            "speaker": "SPEAKER_00",
+        }
+        segments_chunks.append(chunk_dict)
+    result_diarize = {"segments": segments_chunks}
+    return result_diarize
+def segments_to_plain_text(result_diarize):
+    complete_text = ""
+    for seg in result_diarize["segments"]:
+        complete_text += seg["text"] + " "  # issue
+    # Save text to a .txt file
+    # file_name = os.path.splitext(os.path.basename(file_path))[0]
+    txt_file_path = "./text_translation.txt"
+    with open(
+        txt_file_path, "w", encoding='utf-8', errors='replace'
+    ) as txt_file:
+        txt_file.write(complete_text)
+    return txt_file_path, complete_text
+# doc to video
+COLORS = {
+    "black": (0, 0, 0),
+    "white": (255, 255, 255),
+    "red": (255, 0, 0),
+    "green": (0, 255, 0),
+    "blue": (0, 0, 255),
+    "yellow": (255, 255, 0),
+    "light_gray": (200, 200, 200),
+    "light_blue": (173, 216, 230),
+    "light_green": (144, 238, 144),
+    "light_yellow": (255, 255, 224),
+    "light_pink": (255, 182, 193),
+    "lavender": (230, 230, 250),
+    "peach": (255, 218, 185),
+    "light_cyan": (224, 255, 255),
+    "light_salmon": (255, 160, 122),
+    "light_green_yellow": (173, 255, 47),
+}
+BORDER_COLORS = ["dynamic"] + list(COLORS.keys())
+def calculate_average_color(img):
+    # Resize the image to a small size for faster processing
+    img_small = img.resize((50, 50))
+    # Calculate the average color
+    average_color = img_small.convert("RGB").resize((1, 1)).getpixel((0, 0))
+    return average_color
+def add_border_to_image(
+    image_path,
+    target_width,
+    target_height,
+    border_color=None
+):
+    img = Image.open(image_path)
+    # Calculate the width and height for the new image with borders
+    original_width, original_height = img.size
+    original_aspect_ratio = original_width / original_height
+    target_aspect_ratio = target_width / target_height
+    # Resize the image to fit the target resolution retaining aspect ratio
+    if original_aspect_ratio > target_aspect_ratio:
+        # Image is wider, calculate new height
+        new_height = int(target_width / original_aspect_ratio)
+        resized_img = img.resize((target_width, new_height))
+    else:
+        # Image is taller, calculate new width
+        new_width = int(target_height * original_aspect_ratio)
+        resized_img = img.resize((new_width, target_height))
+    # Calculate padding for borders
+    padding = (0, 0, 0, 0)
+    if resized_img.size[0] != target_width or resized_img.size[1] != target_height:
+        if original_aspect_ratio > target_aspect_ratio:
+            # Add borders vertically
+            padding = (0, (target_height - resized_img.size[1]) // 2, 0, (target_height - resized_img.size[1]) // 2)
+        else:
+            # Add borders horizontally
+            padding = ((target_width - resized_img.size[0]) // 2, 0, (target_width - resized_img.size[0]) // 2, 0)
+    # Add borders with specified color
+    if not border_color or border_color == "dynamic":
+        border_color = calculate_average_color(resized_img)
+    else:
+        border_color = COLORS.get(border_color, (0, 0, 0))
+    bordered_img = ImageOps.expand(resized_img, padding, fill=border_color)
+    bordered_img.save(image_path)
+    return image_path
+def resize_and_position_subimage(
+    subimage,
+    max_width,
+    max_height,
+    subimage_position,
+    main_width,
+    main_height
+):
+    subimage_width, subimage_height = subimage.size
+    # Resize subimage if it exceeds maximum dimensions
+    if subimage_width > max_width or subimage_height > max_height:
+        # Calculate scaling factor
+        width_scale = max_width / subimage_width
+        height_scale = max_height / subimage_height
+        scale = min(width_scale, height_scale)
+        # Resize subimage
+        subimage = subimage.resize(
+            (int(subimage_width * scale), int(subimage_height * scale))
+        )
+    # Calculate position to place the subimage
+    if subimage_position == "top-left":
+        subimage_x = 0
+        subimage_y = 0
+    elif subimage_position == "top-right":
+        subimage_x = main_width - subimage.width
+        subimage_y = 0
+    elif subimage_position == "bottom-left":
+        subimage_x = 0
+        subimage_y = main_height - subimage.height
+    elif subimage_position == "bottom-right":
+        subimage_x = main_width - subimage.width
+        subimage_y = main_height - subimage.height
+    else:
+        raise ValueError(
+            "Invalid subimage_position. Choose from 'top-left', 'top-right',"
+            " 'bottom-left', or 'bottom-right'."
+        )
+    return subimage, subimage_x, subimage_y
+def create_image_with_text_and_subimages(
+    text,
+    subimages,
+    width,
+    height,
+    text_color,
+    background_color,
+    output_file
+):
+    # Create an image with the specified resolution and background color
+    image = Image.new('RGB', (width, height), color=background_color)
+    # Initialize ImageDraw object
+    draw = ImageDraw.Draw(image)
+    # Load a font
+    font = ImageFont.load_default()  # You can specify your font file here
+    # Calculate text size and position
+    text_bbox = draw.textbbox((0, 0), text, font=font)
+    text_width = text_bbox[2] - text_bbox[0]
+    text_height = text_bbox[3] - text_bbox[1]
+    text_x = (width - text_width) / 2
+    text_y = (height - text_height) / 2
+    # Draw text on the image
+    draw.text((text_x, text_y), text, fill=text_color, font=font)
+    # Paste subimages onto the main image
+    for subimage_path, subimage_position in subimages:
+        # Open the subimage
+        subimage = Image.open(subimage_path)
+        # Convert subimage to RGBA mode if it doesn't have an alpha channel
+        if subimage.mode != 'RGBA':
+            subimage = subimage.convert('RGBA')
+        # Resize and position the subimage
+        subimage, subimage_x, subimage_y = resize_and_position_subimage(
+            subimage, width / 4, height / 4, subimage_position, width, height
+        )
+        # Paste the subimage onto the main image
+        image.paste(subimage, (int(subimage_x), int(subimage_y)), subimage)
+    image.save(output_file)
+    return output_file
+def doc_to_txtximg_pages(
+    document,
+    width,
+    height,
+    start_page,
+    end_page,
+    bcolor
+):
+    from pypdf import PdfReader
+    images_folder = "pdf_images/"
+    os.makedirs(images_folder, exist_ok=True)
+    remove_directory_contents(images_folder)
+    # First image
+    text_image = os.path.basename(document)[:-4]
+    subimages = [("./assets/logo.jpeg", "top-left")]
+    text_color = (255, 255, 255) if bcolor == "black" else (0, 0, 0)  # w|b
+    background_color = COLORS.get(bcolor, (255, 255, 255))  # dynamic white
+    first_image = "pdf_images/0000_00_aaa.png"
+    create_image_with_text_and_subimages(
+        text_image,
+        subimages,
+        width,
+        height,
+        text_color,
+        background_color,
+        first_image
+    )
+    reader = PdfReader(document)
+    logger.debug(f"Total pages: {reader.get_num_pages()}")
+    start_page_idx = max((start_page-1), 0)
+    end_page_inx = min((end_page), (reader.get_num_pages()))
+    document_pages = reader.pages[start_page_idx:end_page_inx]
+    logger.info(
+        f"Selected pages from {start_page_idx} to {end_page_inx}: "
+        f"{len(document_pages)}"
+    )
+    data_doc = {}
+    for i, page in enumerate(document_pages):
+        count = 0
+        images = []
+        for image_file_object in page.images:
+            img_name = f"{images_folder}{i:04d}_{count:02d}_{image_file_object.name}"
+            images.append(img_name)
+            with open(img_name, "wb") as fp:
+                fp.write(image_file_object.data)
+                count += 1
+            img_name = add_border_to_image(img_name, width, height, bcolor)
+        data_doc[i] = {
+            "text": remove_hyphens(page.extract_text()),
+            "images": images
+        }
+    return data_doc
+def page_data_to_segments(result_text=None, chunk_size=None):
+    if not chunk_size:
+        chunk_size = 100
+    segments_chunks = []
+    time_global = 0
+    for page, result_data in result_text.items():
+        # result_image = result_data["images"]
+        result_text = result_data["text"]
+        text_chunks = split_text_into_chunks(result_text, chunk_size)
+        if not text_chunks:
+            text_chunks = [" "]
+        for chunk in text_chunks:
+            chunk_dict = {
+                "text": chunk,
+                "start": (1.0 + time_global),
+                "end": (2.0 + time_global),
+                "speaker": "SPEAKER_00",
+                "page": page,
+            }
+            segments_chunks.append(chunk_dict)
+            time_global += 1
+    result_diarize = {"segments": segments_chunks}
+    return result_diarize
+def update_page_data(result_diarize, doc_data):
+    complete_text = ""
+    current_page = result_diarize["segments"][0]["page"]
+    text_page = ""
+    for seg in result_diarize["segments"]:
+        text = seg["text"] + " "  # issue
+        complete_text += text
+        page = seg["page"]
+        if page == current_page:
+            text_page += text
+        else:
+            doc_data[current_page]["text"] = text_page
+            # Next
+            text_page = text
+            current_page = page
+    if doc_data[current_page]["text"] != text_page:
+        doc_data[current_page]["text"] = text_page
+    return doc_data
+def fix_timestamps_docs(result_diarize, audio_files):
+    """
+    Lay the segments out back to back using the length of their audio files.
+
+    Parameters:
+    - result_diarize (dict): {"segments": [...]}; "start" and "end" of each
+        paired segment are overwritten in place.
+    - audio_files (list): Audio file paths, paired with the segments in
+        order; pairing stops at the shorter of the two sequences.
+
+    Returns:
+    - dict: The same `result_diarize` object.
+    """
+    cursor = 0.0
+    for segment, audio_path in zip(result_diarize["segments"], audio_files):
+        # Clip length is read from the file and rounded to two decimals.
+        clip_length = round(sf.info(audio_path).duration, 2)
+        segment["start"] = cursor
+        cursor += clip_length
+        segment["end"] = cursor
+    return result_diarize
+def create_video_from_images(
+    doc_data,
+    result_diarize
+):
+    """
+    Build a slideshow video whose images follow the timing of the segments.
+
+    Parameters:
+    - doc_data (dict): Per-page data; each entry holds "text" and "images"
+        and receives a "time_per_image" value here.
+    - result_diarize (dict): {"segments": [...]} whose items carry "start",
+        "end" and "page".
+
+    Returns:
+    - str: "video_from_images.mp4", the output name given to ffmpeg.
+
+    Notes:
+    - Writes the ffmpeg concat list to "list.txt" in the working directory.
+    - Page keys take part in `current_page+1` arithmetic, so they are
+        presumably consecutive integers starting at 0 (TODO confirm).
+    """
+    # First image path
+    first_image = "pdf_images/0000_00_aaa.png"
+    # Time segments and images
+    max_pages_idx = len(doc_data) - 1
+    current_page = result_diarize["segments"][0]["page"]
+    # Seconds of audio accumulated for the page currently being walked.
+    duration_page = 0.0
+    last_image = None
+    for seg in result_diarize["segments"]:
+        start = seg["start"]
+        end = seg["end"]
+        duration_seg = end - start
+        page = seg["page"]
+        if page == current_page:
+            duration_page += duration_seg
+        else:
+            # Page changed: settle the page that just ended.
+            images = doc_data[current_page]["images"]
+            # The fixed first image is prepended once, to the first page
+            # that gets settled.
+            if first_image:
+                images = [first_image] + images
+                first_image = None
+            # If the following page (index clamped to the last page) has
+            # blank text, its images are shown during this page as well.
+            if not doc_data[min(max_pages_idx, (current_page+1))]["text"].strip():
+                images = images + doc_data[min(max_pages_idx, (current_page+1))]["images"]
+            # A page without images re-uses the last image shown.
+            if not images and last_image:
+                images = [last_image]
+            # NOTE(review): `images` can still be empty here (no images and
+            # no previous image), in which case the division below raises
+            # ZeroDivisionError.
+            # Calculate images duration
+            time_duration_per_image = round((duration_page / len(images)), 2)
+            doc_data[current_page]["time_per_image"] = time_duration_per_image
+            # Next values
+            doc_data[current_page]["images"] = images
+            last_image = images[-1]
+            # The segment that triggered the change already belongs to the
+            # new page, so its duration seeds the new accumulator.
+            duration_page = duration_seg
+            current_page = page
+    # The last page never triggers a page change, so it is settled here.
+    if "time_per_image" not in doc_data[current_page].keys():
+        images = doc_data[current_page]["images"]
+        if first_image:
+            images = [first_image] + images
+        if not images:
+            images = [last_image]
+        # NOTE(review): unlike the loop branch, `images` is not stored back
+        # into doc_data here, so a prepended first image or the fallback
+        # image counts towards the per-image time but is not written to
+        # list.txt below; `last_image` may also still be None. Confirm
+        # whether this is intended.
+        time_duration_per_image = round((duration_page / len(images)), 2)
+        doc_data[current_page]["time_per_image"] = time_duration_per_image
+    # Timestamped image video.
+    with open("list.txt", "w") as file:
+        for i, page in enumerate(doc_data.values()):
+            duration = page["time_per_image"]
+            for img in page["images"]:
+                # The final entry is written without a trailing newline.
+                if i == len(doc_data) - 1 and img == page["images"][-1]:  # Check if it's the last item
+                    file.write(f"file {img}\n")
+                    file.write(f"outpoint {duration}")
+                else:
+                    file.write(f"file {img}\n")
+                    file.write(f"outpoint {duration}\n")
+    out_video = "video_from_images.mp4"
+    remove_files(out_video)
+    cm = f"ffmpeg -y -f concat -i list.txt -c:v libx264 -preset veryfast -crf 18 -pix_fmt yuv420p {out_video}"
+    # Alternative command, used only when the first one raises.
+    cm_alt = f"ffmpeg -f concat -i list.txt -c:v libx264 -r 30 -pix_fmt yuv420p -y {out_video}"
+    try:
+        run_command(cm)
+    except Exception as error:
+        logger.error(str(error))
+        remove_files(out_video)
+        run_command(cm_alt)
+    return out_video
+def merge_video_and_audio(video_doc, final_wav_file):
+    """
+    Mux the document video with the final audio track using ffmpeg.
+
+    Parameters:
+    - video_doc (str): Path of the video stream source.
+    - final_wav_file (str): Path of the audio file; it is first encoded
+        to "fixed_audio.mp3".
+
+    Returns:
+    - str: "video_book.mp4", the output name given to ffmpeg.
+    """
+    mp3_track = "fixed_audio.mp3"
+    remove_files(mp3_track)
+    run_command(f"ffmpeg -i {final_wav_file} -c:a libmp3lame {mp3_track}")
+    merged_video = "video_book.mp4"
+    remove_files(merged_video)
+    # Both streams are copied as-is; -shortest stops at the shorter input.
+    mux_command = (
+        f"ffmpeg -i {video_doc} -i {mp3_track} "
+        f"-c:v copy -c:a copy -map 0:v -map 1:a -shortest {merged_video}"
+    )
+    run_command(mux_command)
+    return merged_video
+# subtitles
+def get_subtitle(
+    language,
+    segments_data,
+    extension,
+    filename=None,
+    highlight_words=False,
+):
+    """
+    Write a subtitle file for the given segments and return its name.
+
+    Parameters:
+    - language (str): Language code stored in the data given to the
+        writer; "ja", "zh" and "zh-TW" are all passed on as "ja".
+    - segments_data (dict): Data with a "segments" list; it is deep-copied
+        and never modified.
+    - extension (str): Subtitle format. "ass" is produced by writing "srt"
+        first and converting the result with ffmpeg.
+    - filename (str, optional): Base name without extension; defaults to
+        "task_subtitle".
+    - highlight_words (bool): Forwarded to the writer. When False, the
+        word-level data is stripped before writing.
+
+    Returns:
+    - str: Name of the subtitle file that was written.
+    """
+    stem = filename if filename else "task_subtitle"
+    needs_ass_conversion = extension == "ass"
+    if needs_ass_conversion:
+        extension = "srt"
+    sub_file = ".".join([stem, extension])
+    remove_files(sub_file)
+    writer = get_writer(extension, output_dir=".")
+    # Get data subs
+    subtitle_data = copy.deepcopy(segments_data)
+    if language in ["ja", "zh", "zh-TW"]:
+        subtitle_data["language"] = "ja"
+    else:
+        subtitle_data["language"] = language
+    # Clean
+    if not highlight_words:
+        subtitle_data.pop("word_segments", None)
+        for segment in subtitle_data["segments"]:
+            for unused_key in ("speaker", "chars", "words"):
+                segment.pop(unused_key, None)
+    # The writer receives an ".mp3" name built from the same base name
+    # (presumably it derives the output file name from it; confirm).
+    writer(
+        subtitle_data,
+        stem + ".mp3",
+        {
+            "highlight_words": highlight_words,
+            "max_line_count": None,
+            "max_line_width": None,
+        },
+    )
+    if not needs_ass_conversion:
+        return sub_file
+    ass_file = stem + ".ass"
+    remove_files(ass_file)
+    run_command(f'ffmpeg -i "{sub_file}" "{ass_file}" -y')
+    return ass_file
+def process_subtitles(
+    deep_copied_result,
+    align_language,
+    result_diarize,
+    output_format_subtitle,
+    TRANSLATE_AUDIO_TO,
+):
+    """
+    Write the original-language and the translated subtitle files.
+
+    Parameters:
+    - deep_copied_result (dict): Segments in the original language.
+    - align_language (str): Language code of the original segments;
+        "zh-TW" is passed to the writer as "zh".
+    - result_diarize (dict): Segments in the translated language.
+    - output_format_subtitle (str): Subtitle format / file extension.
+    - TRANSLATE_AUDIO_TO (str): Target language code.
+
+    Returns:
+    - str: Name of the translated subtitle file ("sub_tra.<format>").
+    """
+    name_ori = "sub_ori."
+    name_tra = "sub_tra."
+    ori_file = name_ori + output_format_subtitle
+    tra_file = name_tra + output_format_subtitle
+    remove_files([ori_file, tra_file])
+    writer = get_writer(output_format_subtitle, output_dir=".")
+    word_options = {
+        "highlight_words": False,
+        "max_line_count": None,
+        "max_line_width": None,
+    }
+    # original lang
+    original_subs = copy.deepcopy(deep_copied_result)
+    if align_language == "zh-TW":
+        original_subs["language"] = "zh"
+    else:
+        original_subs["language"] = align_language
+    for segment in original_subs["segments"]:
+        segment.pop("speaker", None)
+    ori_support_name = name_ori[:-1] + ".mp3"
+    try:
+        writer(original_subs, ori_support_name, word_options)
+    except Exception as error:
+        message = str(error)
+        logger.error(message)
+        if message == "list indices must be integers or slices, not str":
+            logger.error(
+                "Related to poor word segmentation"
+                " in segments after alignment."
+            )
+        # Retry once with the word data removed from the first segment.
+        original_subs["segments"][0].pop("words")
+        writer(original_subs, ori_support_name, word_options)
+    # translated lang
+    translated_subs = copy.deepcopy(result_diarize)
+    # NOTE(review): the fallback below is the source language
+    # (align_language), not TRANSLATE_AUDIO_TO; confirm this is intended.
+    if TRANSLATE_AUDIO_TO in ["ja", "zh", "zh-TW"]:
+        translated_subs["language"] = "ja"
+    else:
+        translated_subs["language"] = align_language
+    translated_subs.pop("word_segments", None)
+    for segment in translated_subs["segments"]:
+        for unused_key in ("speaker", "chars", "words"):
+            segment.pop(unused_key, None)
+    writer(translated_subs, name_tra[:-1] + ".mp3", word_options)
+    return tra_file
+def linguistic_level_segments(
+    result_base,
+    linguistic_unit="word",  # word or char
+):
+    """
+    Turn aligned segments into one segment per word or per character.
+
+    Parameters:
+    - result_base (dict): {"segments": [...]} whose items hold a "words"
+        or "chars" list; the input is deep-copied and left untouched.
+    - linguistic_unit (str): "word" or "char"; only the first four
+        characters are used.
+
+    Returns:
+    - dict: {"segments": [...]} with "start", "end" and "text" per unit.
+        A unit without a "start" is appended to the text of the previous
+        unit, or dropped when there is no previous unit.
+
+    Raises:
+    - ValueError: If the first segment has no list for the requested unit.
+    """
+    unit_name = linguistic_unit[:4]
+    units_key = unit_name + "s"
+    working = copy.deepcopy(result_base)
+    if units_key not in working["segments"][0]:
+        raise ValueError("No alignment detected, can't process")
+    flattened = []
+    for segment in working["segments"]:
+        for unit in segment[units_key]:
+            piece = unit[unit_name]
+            if "start" not in unit:
+                # Untimed unit: attach it to the previous timed one.
+                if flattened:
+                    flattened[-1]["text"] += piece
+                continue
+            flattened.append(
+                {
+                    "start": unit["start"],
+                    "end": unit["end"],
+                    "text": piece,
+                }
+            )
+    return {"segments": flattened}
+def break_aling_segments(
+    result: dict,
+    break_characters: str = "",  # ":|,|.|"
+):
+    """
+    Re-split aligned segments at the given break characters.
+
+    Parameters:
+    - result (dict): Alignment result whose "segments" each carry "text"
+        and a "chars" list (dicts with a "char" key and, when timed,
+        "start"/"end"). The text is sliced with the same indices as
+        "chars", so both are expected to line up one to one.
+    - break_characters (str): Characters to split on, separated by "|"
+        (for example ":|,|."). Empty entries are ignored.
+
+    Returns:
+    - dict: `result` itself when no break character is given; otherwise
+        {"segments": [...]} where every piece has "start", "end", "text"
+        and "words" (the character dicts with "char" renamed to "word").
+
+    Raises:
+    - Exception: If a piece contains no character with a "start" or no
+        character with an "end".
+    """
+    break_characters_list = [
+        item for item in break_characters.split("|") if item != ""
+    ]
+    if not break_characters_list:
+        logger.info("No valid break characters were specified.")
+        return result
+    logger.info(f"Redivide text segments by: {str(break_characters_list)}")
+    # Work on a copy: the char dicts are renamed in place further down.
+    result_align = copy.deepcopy(result)
+    # create new with filters
+    normal = []
+    def process_chars(chars, text):
+        # Piece timing: first char that has a "start", last that has an "end".
+        start_value = next((c["start"] for c in chars if "start" in c), None)
+        end_value = next(
+            (c["end"] for c in reversed(chars) if "end" in c), None
+        )
+        # Compare against None: 0.0 is a valid timestamp.
+        if start_value is None or end_value is None:
+            raise Exception(
+                f"Unable to obtain a valid timestamp for chars: {str(chars)}"
+            )
+        return {
+            "start": start_value,
+            "end": end_value,
+            "text": text,
+            "words": chars,
+        }
+    def merge_into_previous(chars, text):
+        # A one-character piece is glued onto the previous piece instead of
+        # becoming a segment of its own; that piece keeps its timestamps.
+        normal[-1]["text"] += text
+        normal[-1]["words"].extend(chars)
+    for i, segment in enumerate(result_align["segments"]):
+        logger.debug(f"- Process segment: {i}, text: {segment['text']}")
+        segment_chars = segment["chars"]
+        last_index = len(segment_chars) - 1
+        # Index of the first character not yet assigned to a piece.
+        letter_new_start = 0
+        for num, char in enumerate(segment_chars):
+            if char["char"] is None:
+                continue
+            # Break by character
+            if char["char"] in break_characters_list:
+                text = segment["text"][letter_new_start:num+1]
+                logger.debug(
+                    f"Break in: {char['char']}, position: {num}, text: {text}"
+                )
+                chars = segment_chars[letter_new_start:num+1]
+                if not text:
+                    logger.debug("No text")
+                    continue
+                if num == 0 and not text.strip():
+                    logger.debug("blank space in start")
+                    continue
+                if len(text) > 1:
+                    normal_dict = process_chars(chars, text)
+                    letter_new_start = num+1
+                    normal.append(normal_dict)
+                elif normal:
+                    logger.debug(f"Short char append, num: {num}")
+                    merge_into_previous(chars, text)
+                    # Advance the cursor so the merged character is not
+                    # repeated at the start of the next piece.
+                    letter_new_start = num+1
+                    continue
+                # Otherwise there is no earlier piece to merge into: the
+                # lone character stays pending and joins the next piece.
+            # If we reach the end of the segment, add the last part of chars.
+            if num == last_index:
+                text = segment["text"][letter_new_start:num+1]
+                chars = segment_chars[letter_new_start:num+1]
+                # If remain text len is not default len text
+                if num not in [len(text)-1, len(text)] and text:
+                    logger.debug(f'Remaining text: {text}')
+                if not text:
+                    logger.debug("No remaining text.")
+                    continue
+                if len(text) == 1 and normal:
+                    logger.debug(f"Short char append, num: {num}")
+                    merge_into_previous(chars, text)
+                    continue
+                normal_dict = process_chars(chars, text)
+                letter_new_start = num+1
+                normal.append(normal_dict)
+    # Rename char to word
+    for item in normal:
+        for word_item in item["words"]:
+            if "char" in word_item:
+                word_item["word"] = word_item.pop("char")
+    # Convert to dict default
+    break_segments = {"segments": normal}
+    msg_count = (
+        f"Segment count before: {len(result['segments'])}, "
+        f"after: {len(break_segments['segments'])}."
+    )
+    logger.info(msg_count)
+    return break_segments

soni_translate/text_to_speech.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

soni_translate/translate_segments.py CHANGED Viewed

@@ -1,457 +1,457 @@
-from tqdm import tqdm
-from deep_translator import GoogleTranslator
-from itertools import chain
-import copy
-from .language_configuration import fix_code_language, INVERTED_LANGUAGES
-from .logging_setup import logger
-import re
-import json
-import time
-TRANSLATION_PROCESS_OPTIONS = [
-    "google_translator_batch",
-    "google_translator",
-    "gpt-3.5-turbo-0125_batch",
-    "gpt-3.5-turbo-0125",
-    "gpt-4-turbo-preview_batch",
-    "gpt-4-turbo-preview",
-    "disable_translation",
-]
-DOCS_TRANSLATION_PROCESS_OPTIONS = [
-    "google_translator",
-    "gpt-3.5-turbo-0125",
-    "gpt-4-turbo-preview",
-    "disable_translation",
-]
-def translate_iterative(segments, target, source=None):
-    """
-    Translate text segments individually to the specified language.
-    Parameters:
-    - segments (list): A list of dictionaries with 'text' as a key for
-        segment text.
-    - target (str): Target language code.
-    - source (str, optional): Source language code. Defaults to None.
-    Returns:
-    - list: Translated text segments in the target language.
-    Notes:
-    - Translates each segment using Google Translate.
-    Example:
-    segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
-    translated_segments = translate_iterative(segments, 'es')
-    """
-    segments_ = copy.deepcopy(segments)
-    if (
-        not source
-    ):
-        logger.debug("No source language")
-        source = "auto"
-    translator = GoogleTranslator(source=source, target=target)
-    for line in tqdm(range(len(segments_))):
-        text = segments_[line]["text"]
-        translated_line = translator.translate(text.strip())
-        segments_[line]["text"] = translated_line
-    return segments_
-def verify_translate(
-    segments,
-    segments_copy,
-    translated_lines,
-    target,
-    source
-):
-    """
-    Verify integrity and translate segments if lengths match, otherwise
-    switch to iterative translation.
-    """
-    if len(segments) == len(translated_lines):
-        for line in range(len(segments_copy)):
-            logger.debug(
-                f"{segments_copy[line]['text']} >> "
-                f"{translated_lines[line].strip()}"
-            )
-            segments_copy[line]["text"] = translated_lines[
-                line].replace("\t", "").replace("\n", "").strip()
-        return segments_copy
-    else:
-        logger.error(
-            "The translation failed, switching to google_translate iterative. "
-            f"{len(segments), len(translated_lines)}"
-        )
-        return translate_iterative(segments, target, source)
-def translate_batch(segments, target, chunk_size=2000, source=None):
-    """
-    Translate a batch of text segments into the specified language in chunks,
-        respecting the character limit.
-    Parameters:
-    - segments (list): List of dictionaries with 'text' as a key for segment
-        text.
-    - target (str): Target language code.
-    - chunk_size (int, optional): Maximum character limit for each translation
-        chunk (default is 2000; max 5000).
-    - source (str, optional): Source language code. Defaults to None.
-    Returns:
-    - list: Translated text segments in the target language.
-    Notes:
-    - Splits input segments into chunks respecting the character limit for
-        translation.
-    - Translates the chunks using Google Translate.
-    - If chunked translation fails, switches to iterative translation using
-        `translate_iterative()`.
-    Example:
-    segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
-    translated = translate_batch(segments, 'es', chunk_size=4000, source='en')
-    """
-    segments_copy = copy.deepcopy(segments)
-    if (
-        not source
-    ):
-        logger.debug("No source language")
-        source = "auto"
-    # Get text
-    text_lines = []
-    for line in range(len(segments_copy)):
-        text = segments_copy[line]["text"].strip()
-        text_lines.append(text)
-    # chunk limit
-    text_merge = []
-    actual_chunk = ""
-    global_text_list = []
-    actual_text_list = []
-    for one_line in text_lines:
-        one_line = " " if not one_line else one_line
-        if (len(actual_chunk) + len(one_line)) <= chunk_size:
-            if actual_chunk:
-                actual_chunk += " ||||| "
-            actual_chunk += one_line
-            actual_text_list.append(one_line)
-        else:
-            text_merge.append(actual_chunk)
-            actual_chunk = one_line
-            global_text_list.append(actual_text_list)
-            actual_text_list = [one_line]
-    if actual_chunk:
-        text_merge.append(actual_chunk)
-        global_text_list.append(actual_text_list)
-    # translate chunks
-    progress_bar = tqdm(total=len(segments), desc="Translating")
-    translator = GoogleTranslator(source=source, target=target)
-    split_list = []
-    try:
-        for text, text_iterable in zip(text_merge, global_text_list):
-            translated_line = translator.translate(text.strip())
-            split_text = translated_line.split("|||||")
-            if len(split_text) == len(text_iterable):
-                progress_bar.update(len(split_text))
-            else:
-                logger.debug(
-                    "Chunk fixing iteratively. Len chunk: "
-                    f"{len(split_text)}, expected: {len(text_iterable)}"
-                )
-                split_text = []
-                for txt_iter in text_iterable:
-                    translated_txt = translator.translate(txt_iter.strip())
-                    split_text.append(translated_txt)
-                    progress_bar.update(1)
-            split_list.append(split_text)
-        progress_bar.close()
-    except Exception as error:
-        progress_bar.close()
-        logger.error(str(error))
-        logger.warning(
-            "The translation in chunks failed, switching to iterative."
-            " Related: too many request"
-        )  # use proxy or less chunk size
-        return translate_iterative(segments, target, source)
-    # un chunk
-    translated_lines = list(chain.from_iterable(split_list))
-    return verify_translate(
-        segments, segments_copy, translated_lines, target, source
-    )
-def call_gpt_translate(
-    client,
-    model,
-    system_prompt,
-    user_prompt,
-    original_text=None,
-    batch_lines=None,
-):
-    # https://platform.openai.com/docs/guides/text-generation/json-mode
-    response = client.chat.completions.create(
-        model=model,
-        response_format={"type": "json_object"},
-        messages=[
-          {"role": "system", "content": system_prompt},
-          {"role": "user", "content": user_prompt}
-        ]
-    )
-    result = response.choices[0].message.content
-    logger.debug(f"Result: {str(result)}")
-    try:
-        translation = json.loads(result)
-    except Exception as error:
-        match_result = re.search(r'\{.*?\}', result)
-        if match_result:
-            logger.error(str(error))
-            json_str = match_result.group(0)
-            translation = json.loads(json_str)
-        else:
-            raise error
-    # Get valid data
-    if batch_lines:
-        for conversation in translation.values():
-            if isinstance(conversation, dict):
-                conversation = list(conversation.values())[0]
-            if (
-                list(
-                    original_text["conversation"][0].values()
-                )[0].strip() ==
-                list(conversation[0].values())[0].strip()
-            ):
-                continue
-            if len(conversation) == batch_lines:
-                break
-        fix_conversation_length = []
-        for line in conversation:
-            for speaker_code, text_tr in line.items():
-                fix_conversation_length.append({speaker_code: text_tr})
-        logger.debug(f"Data batch: {str(fix_conversation_length)}")
-        logger.debug(
-            f"Lines Received: {len(fix_conversation_length)},"
-            f" expected: {batch_lines}"
-        )
-        return fix_conversation_length
-    else:
-        if isinstance(translation, dict):
-            translation = list(translation.values())[0]
-        if isinstance(translation, list):
-            translation = translation[0]
-        if isinstance(translation, set):
-            translation = list(translation)[0]
-        if not isinstance(translation, str):
-            raise ValueError(f"No valid response received: {str(translation)}")
-        return translation
-def gpt_sequential(segments, model, target, source=None):
-    from openai import OpenAI
-    translated_segments = copy.deepcopy(segments)
-    client = OpenAI()
-    progress_bar = tqdm(total=len(segments), desc="Translating")
-    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
-    lang_sc = ""
-    if source:
-        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()
-    fixed_target = fix_code_language(target)
-    fixed_source = fix_code_language(source) if source else "auto"
-    system_prompt = "Machine translation designed to output the translated_text JSON."
-    for i, line in enumerate(translated_segments):
-        text = line["text"].strip()
-        start = line["start"]
-        user_prompt = f"Translate the following {lang_sc} text into {lang_tg}, write the fully translated text and nothing more:\n{text}"
-        time.sleep(0.5)
-        try:
-            translated_text = call_gpt_translate(
-                client,
-                model,
-                system_prompt,
-                user_prompt,
-            )
-        except Exception as error:
-            logger.error(
-                f"{str(error)} >> The text of segment {start} "
-                "is being corrected with Google Translate"
-            )
-            translator = GoogleTranslator(
-                source=fixed_source, target=fixed_target
-            )
-            translated_text = translator.translate(text.strip())
-        translated_segments[i]["text"] = translated_text.strip()
-        progress_bar.update(1)
-    progress_bar.close()
-    return translated_segments
-def gpt_batch(segments, model, target, token_batch_limit=900, source=None):
-    from openai import OpenAI
-    import tiktoken
-    token_batch_limit = max(100, (token_batch_limit - 40) // 2)
-    progress_bar = tqdm(total=len(segments), desc="Translating")
-    segments_copy = copy.deepcopy(segments)
-    encoding = tiktoken.get_encoding("cl100k_base")
-    client = OpenAI()
-    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
-    lang_sc = ""
-    if source:
-        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()
-    fixed_target = fix_code_language(target)
-    fixed_source = fix_code_language(source) if source else "auto"
-    name_speaker = "ABCDEFGHIJKL"
-    translated_lines = []
-    text_data_dict = []
-    num_tokens = 0
-    count_sk = {char: 0 for char in "ABCDEFGHIJKL"}
-    for i, line in enumerate(segments_copy):
-        text = line["text"]
-        speaker = line["speaker"]
-        last_start = line["start"]
-        # text_data_dict.append({str(int(speaker[-1])+1): text})
-        index_sk = int(speaker[-2:])
-        character_sk = name_speaker[index_sk]
-        count_sk[character_sk] += 1
-        code_sk = character_sk+str(count_sk[character_sk])
-        text_data_dict.append({code_sk: text})
-        num_tokens += len(encoding.encode(text)) + 7
-        if num_tokens >= token_batch_limit or i == len(segments_copy)-1:
-            try:
-                batch_lines = len(text_data_dict)
-                batch_conversation = {"conversation": copy.deepcopy(text_data_dict)}
-                # Reset vars
-                num_tokens = 0
-                text_data_dict = []
-                count_sk = {char: 0 for char in "ABCDEFGHIJKL"}
-                # Process translation
-                # https://arxiv.org/pdf/2309.03409.pdf
-                system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items."
-                user_prompt = f"Translate each of the following text values in conversation{' from' if lang_sc else ''} {lang_sc} to {lang_tg}:\n{batch_conversation}"
-                logger.debug(f"Prompt: {str(user_prompt)}")
-                conversation = call_gpt_translate(
-                    client,
-                    model,
-                    system_prompt,
-                    user_prompt,
-                    original_text=batch_conversation,
-                    batch_lines=batch_lines,
-                )
-                if len(conversation) < batch_lines:
-                    raise ValueError(
-                        "Incomplete result received. Batch lines: "
-                        f"{len(conversation)}, expected: {batch_lines}"
-                    )
-                for i, translated_text in enumerate(conversation):
-                    if i+1 > batch_lines:
-                        break
-                    translated_lines.append(list(translated_text.values())[0])
-                progress_bar.update(batch_lines)
-            except Exception as error:
-                logger.error(str(error))
-                first_start = segments_copy[max(0, i-(batch_lines-1))]["start"]
-                logger.warning(
-                    f"The batch from {first_start} to {last_start} "
-                    "failed, is being corrected with Google Translate"
-                )
-                translator = GoogleTranslator(
-                    source=fixed_source,
-                    target=fixed_target
-                )
-                for txt_source in batch_conversation["conversation"]:
-                    translated_txt = translator.translate(
-                        list(txt_source.values())[0].strip()
-                    )
-                    translated_lines.append(translated_txt.strip())
-                    progress_bar.update(1)
-    progress_bar.close()
-    return verify_translate(
-        segments, segments_copy, translated_lines, fixed_target, fixed_source
-    )
-def translate_text(
-    segments,
-    target,
-    translation_process="google_translator_batch",
-    chunk_size=4500,
-    source=None,
-    token_batch_limit=1000,
-):
-    """Translates text segments using a specified process."""
-    match translation_process:
-        case "google_translator_batch":
-            return translate_batch(
-                segments,
-                fix_code_language(target),
-                chunk_size,
-                fix_code_language(source)
-            )
-        case "google_translator":
-            return translate_iterative(
-                segments,
-                fix_code_language(target),
-                fix_code_language(source)
-            )
-        case model if model in ["gpt-3.5-turbo-0125", "gpt-4-turbo-preview"]:
-            return gpt_sequential(segments, model, target, source)
-        case model if model in ["gpt-3.5-turbo-0125_batch", "gpt-4-turbo-preview_batch",]:
-            return gpt_batch(
-                segments,
-                translation_process.replace("_batch", ""),
-                target,
-                token_batch_limit,
-                source
-            )
-        case "disable_translation":
-            return segments
-        case _:
-            raise ValueError("No valid translation process")

+from tqdm import tqdm
+from deep_translator import GoogleTranslator
+from itertools import chain
+import copy
+from .language_configuration import fix_code_language, INVERTED_LANGUAGES
+from .logging_setup import logger
+import re
+import json
+import time
+# Identifiers of the available translation processes. Entries ending in
+# "_batch" are the batch variants of the entry with the same prefix.
+# Presumably these are the values accepted as `translation_process` by
+# `translate_text` later in this file (confirm).
+TRANSLATION_PROCESS_OPTIONS = [
+    "google_translator_batch",
+    "google_translator",
+    "gpt-3.5-turbo-0125_batch",
+    "gpt-3.5-turbo-0125",
+    "gpt-4-turbo-preview_batch",
+    "gpt-4-turbo-preview",
+    "disable_translation",
+]
+# Same identifiers without the "_batch" variants; judging by the name this
+# is the list offered for document translation (confirm against callers).
+DOCS_TRANSLATION_PROCESS_OPTIONS = [
+    "google_translator",
+    "gpt-3.5-turbo-0125",
+    "gpt-4-turbo-preview",
+    "disable_translation",
+]
+def translate_iterative(segments, target, source=None):
+    """
+    Translate the segments one at a time with Google Translate.
+
+    Parameters:
+    - segments (list): Dictionaries holding the segment text under the
+        'text' key; the input list is not modified.
+    - target (str): Target language code.
+    - source (str, optional): Source language code. When it is empty or
+        None, "auto" is used.
+
+    Returns:
+    - list: A deep copy of `segments` whose 'text' values are replaced by
+        the translation of the stripped original text.
+
+    Example:
+    segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
+    translated_segments = translate_iterative(segments, 'es')
+    """
+    translated_segments = copy.deepcopy(segments)
+    if not source:
+        logger.debug("No source language")
+        source = "auto"
+    translator = GoogleTranslator(source=source, target=target)
+    # tqdm only wraps the list to display progress.
+    for segment in tqdm(translated_segments):
+        segment["text"] = translator.translate(segment["text"].strip())
+    return translated_segments
+def verify_translate(
+    segments,
+    segments_copy,
+    translated_lines,
+    target,
+    source
+):
+    """
+    Store the translated lines in `segments_copy` when the line count
+    matches the segment count; otherwise translate `segments` again with
+    `translate_iterative`.
+
+    Parameters:
+    - segments (list): Original segments, used for the length check and
+        for the fallback translation.
+    - segments_copy (list): Segments whose 'text' is overwritten in place.
+    - translated_lines (list): Translated strings, one per segment.
+    - target (str): Target language code for the fallback.
+    - source (str): Source language code for the fallback.
+
+    Returns:
+    - list: `segments_copy` with tabs and newlines removed from the
+        stripped translations, or the result of the fallback.
+    """
+    if len(segments) != len(translated_lines):
+        logger.error(
+            "The translation failed, switching to google_translate iterative. "
+            f"{len(segments), len(translated_lines)}"
+        )
+        return translate_iterative(segments, target, source)
+    for index, segment in enumerate(segments_copy):
+        raw_line = translated_lines[index]
+        logger.debug(f"{segment['text']} >> {raw_line.strip()}")
+        segment["text"] = raw_line.replace("\t", "").replace("\n", "").strip()
+    return segments_copy
+def translate_batch(segments, target, chunk_size=2000, source=None):
+    """
+    Translate a batch of text segments into the specified language in chunks,
+        respecting the character limit.
+    Parameters:
+    - segments (list): List of dictionaries with 'text' as a key for segment
+        text.
+    - target (str): Target language code.
+    - chunk_size (int, optional): Maximum character limit for each translation
+        chunk, separators included (default is 2000; max 5000).
+    - source (str, optional): Source language code. Defaults to None, which
+        is sent to the translator as "auto".
+    Returns:
+    - list: Translated text segments in the target language.
+    Notes:
+    - Joins the segment texts with " ||||| " into chunks whose total length,
+        separators included, stays within `chunk_size`. A single segment
+        longer than `chunk_size` is sent as a chunk of its own.
+    - Translates the chunks using Google Translate.
+    - When a translated chunk does not split back into the expected number
+        of pieces, that chunk is translated again segment by segment.
+    - If chunked translation fails, switches to iterative translation using
+        `translate_iterative()`.
+    Example:
+    segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
+    translated = translate_batch(segments, 'es', chunk_size=4000, source='en')
+    """
+    segments_copy = copy.deepcopy(segments)
+    if not source:
+        logger.debug("No source language")
+        source = "auto"
+    separator = " ||||| "
+    # Get text; an empty line becomes a single space so that every segment
+    # still owns one slot between separators
+    text_lines = [
+        segment["text"].strip() or " " for segment in segments_copy
+    ]
+    # chunk limit
+    text_merge = []  # joined chunk strings sent to the translator
+    global_text_list = []  # source lines of each chunk, same order
+    actual_text_list = []
+    actual_length = 0
+    for one_line in text_lines:
+        # The separator counts against the limit; ignoring it lets a chunk
+        # made of many short lines grow past what the translator accepts
+        needed = len(one_line)
+        if actual_text_list:
+            needed += len(separator)
+        if actual_text_list and actual_length + needed > chunk_size:
+            text_merge.append(separator.join(actual_text_list))
+            global_text_list.append(actual_text_list)
+            actual_text_list = []
+            actual_length = 0
+            needed = len(one_line)
+        actual_text_list.append(one_line)
+        actual_length += needed
+    if actual_text_list:
+        text_merge.append(separator.join(actual_text_list))
+        global_text_list.append(actual_text_list)
+    # translate chunks
+    progress_bar = tqdm(total=len(segments), desc="Translating")
+    translator = GoogleTranslator(source=source, target=target)
+    split_list = []
+    try:
+        for text, text_iterable in zip(text_merge, global_text_list):
+            translated_line = translator.translate(text.strip())
+            split_text = translated_line.split("|||||")
+            if len(split_text) == len(text_iterable):
+                progress_bar.update(len(split_text))
+            else:
+                # The separators did not survive translation; redo this
+                # chunk one line at a time
+                logger.debug(
+                    "Chunk fixing iteratively. Len chunk: "
+                    f"{len(split_text)}, expected: {len(text_iterable)}"
+                )
+                split_text = []
+                for txt_iter in text_iterable:
+                    translated_txt = translator.translate(txt_iter.strip())
+                    split_text.append(translated_txt)
+                    progress_bar.update(1)
+            split_list.append(split_text)
+        progress_bar.close()
+    except Exception as error:
+        # Close the bar before the fallback starts its own
+        progress_bar.close()
+        logger.error(str(error))
+        logger.warning(
+            "The translation in chunks failed, switching to iterative."
+            " Related: too many request"
+        )  # use proxy or less chunk size
+        return translate_iterative(segments, target, source)
+    # un chunk
+    translated_lines = list(chain.from_iterable(split_list))
+    return verify_translate(
+        segments, segments_copy, translated_lines, target, source
+    )
+def call_gpt_translate(
+    client,
+    model,
+    system_prompt,
+    user_prompt,
+    original_text=None,
+    batch_lines=None,
+):
+    """
+    Send one chat-completion request in JSON mode and extract the
+        translation from the reply.
+    Parameters:
+    - client: Object exposing `chat.completions.create` (the callers in this
+        file pass an `openai.OpenAI()` instance).
+    - model (str): Model name forwarded to the request.
+    - system_prompt (str): Content of the system message.
+    - user_prompt (str): Content of the user message.
+    - original_text (dict, optional): The batch that was sent; its
+        "conversation" list of single-key dicts is read only when
+        `batch_lines` is set.
+    - batch_lines (int, optional): Expected number of translated lines. A
+        truthy value selects batch mode.
+    Returns:
+    - list: In batch mode, one `{speaker_code: text}` dict per translated
+        line. The length is only logged here, not enforced.
+    - str: Otherwise, the translated text.
+    Raises:
+    - ValueError: If the reply holds no usable translation.
+    - Exception: Whatever the client or the JSON parser raises.
+    """
+    # https://platform.openai.com/docs/guides/text-generation/json-mode
+    response = client.chat.completions.create(
+        model=model,
+        response_format={"type": "json_object"},
+        messages=[
+          {"role": "system", "content": system_prompt},
+          {"role": "user", "content": user_prompt}
+        ]
+    )
+    result = response.choices[0].message.content
+    logger.debug(f"Result: {str(result)}")
+    try:
+        translation = json.loads(result)
+    except Exception as error:
+        # Recover a JSON object surrounded by extra text: decode the first
+        # complete value starting at the first "{". Unlike a non-greedy
+        # regex, this keeps nested objects and multi-line replies intact.
+        json_start = result.find("{")
+        if json_start == -1:
+            raise
+        logger.error(str(error))
+        translation, _ = json.JSONDecoder().raw_decode(result, json_start)
+    # Get valid data
+    if batch_lines:
+        if not isinstance(translation, dict) or not translation:
+            raise ValueError(f"No valid response received: {str(translation)}")
+        first_source_text = list(
+            original_text["conversation"][0].values()
+        )[0].strip()
+        for conversation in translation.values():
+            if isinstance(conversation, dict):
+                conversation = list(conversation.values())[0]
+            # Skip a candidate whose first text equals the first source
+            # text (presumably an untranslated echo of the input)
+            if first_source_text == list(conversation[0].values())[0].strip():
+                continue
+            if len(conversation) == batch_lines:
+                break
+        # When no candidate matched, the last one examined is used
+        # Flatten so every returned dict holds exactly one pair, even if
+        # several pairs arrived grouped in a single dict
+        fix_conversation_length = []
+        for line in conversation:
+            for speaker_code, text_tr in line.items():
+                fix_conversation_length.append({speaker_code: text_tr})
+        logger.debug(f"Data batch: {str(fix_conversation_length)}")
+        logger.debug(
+            f"Lines Received: {len(fix_conversation_length)},"
+            f" expected: {batch_lines}"
+        )
+        return fix_conversation_length
+    else:
+        # Unwrap dict -> list -> set down to the first contained value
+        if isinstance(translation, dict):
+            translation = list(translation.values())[0]
+        if isinstance(translation, list):
+            translation = translation[0]
+        if isinstance(translation, set):
+            translation = list(translation)[0]
+        if not isinstance(translation, str):
+            raise ValueError(f"No valid response received: {str(translation)}")
+        return translation
+def gpt_sequential(segments, model, target, source=None):
+    """
+    Translate segments one request at a time with a GPT model.
+    Parameters:
+    - segments (list): List of dictionaries; the 'text' and 'start' keys of
+        each one are read.
+    - model (str): Model name passed to `call_gpt_translate()`.
+    - target (str): Target language, used as a key of INVERTED_LANGUAGES.
+    - source (str, optional): Source language, same convention. Defaults to
+        None ("auto" for the Google Translate fallback).
+    Returns:
+    - list: Deep copy of `segments` with every 'text' replaced by its
+        translation.
+    Notes:
+    - Waits 0.5 seconds before each request.
+    - A segment whose request fails is translated with Google Translate.
+    """
+    from openai import OpenAI
+    translated_segments = copy.deepcopy(segments)
+    client = OpenAI()
+    progress_bar = tqdm(total=len(segments), desc="Translating")
+    # Language names for the prompt, parenthesised parts removed
+    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
+    lang_sc = (
+        re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()
+        if source
+        else ""
+    )
+    fixed_target = fix_code_language(target)
+    fixed_source = fix_code_language(source) if source else "auto"
+    system_prompt = "Machine translation designed to output the translated_text JSON."
+    for segment in translated_segments:
+        text = segment["text"].strip()
+        start = segment["start"]
+        user_prompt = f"Translate the following {lang_sc} text into {lang_tg}, write the fully translated text and nothing more:\n{text}"
+        time.sleep(0.5)
+        try:
+            translated_text = call_gpt_translate(
+                client, model, system_prompt, user_prompt
+            )
+        except Exception as error:
+            logger.error(
+                f"{str(error)} >> The text of segment {start} "
+                "is being corrected with Google Translate"
+            )
+            translated_text = GoogleTranslator(
+                source=fixed_source, target=fixed_target
+            ).translate(text.strip())
+        segment["text"] = translated_text.strip()
+        progress_bar.update(1)
+    progress_bar.close()
+    return translated_segments
+def gpt_batch(segments, model, target, token_batch_limit=900, source=None):
+    """
+    Translate segments with a GPT model, several segments per request.
+    Parameters:
+    - segments (list): List of dictionaries; the 'text', 'speaker' and
+        'start' keys of each one are read.
+    - model (str): Model name passed to `call_gpt_translate()`.
+    - target (str): Target language, used as a key of INVERTED_LANGUAGES.
+    - token_batch_limit (int, optional): Token budget from which the
+        per-batch limit is derived (default is 900).
+    - source (str, optional): Source language, same convention as `target`.
+        Defaults to None ("auto" for the Google Translate fallback).
+    Returns:
+    - list: Result of `verify_translate()` for the collected translations.
+    Notes:
+    - Each line is labelled with a speaker letter plus a per-batch counter
+        (A1, A2, B1, ...) and sent as a "conversation" list.
+    - A batch that fails, or that comes back with fewer lines than sent, is
+        translated line by line with Google Translate instead.
+    """
+    from openai import OpenAI
+    import tiktoken
+    # NOTE(review): presumably keeps room for the prompt and for the reply,
+    # which is about as long as the input - confirm the intent of this
+    token_batch_limit = max(100, (token_batch_limit - 40) // 2)
+    progress_bar = tqdm(total=len(segments), desc="Translating")
+    segments_copy = copy.deepcopy(segments)
+    encoding = tiktoken.get_encoding("cl100k_base")
+    client = OpenAI()
+    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
+    lang_sc = ""
+    if source:
+        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()
+    fixed_target = fix_code_language(target)
+    fixed_source = fix_code_language(source) if source else "auto"
+    name_speaker = "ABCDEFGHIJKL"
+    translated_lines = []
+    text_data_dict = []
+    num_tokens = 0
+    count_sk = {char: 0 for char in name_speaker}
+    last_index = len(segments_copy) - 1
+    for i, line in enumerate(segments_copy):
+        text = line["text"]
+        speaker = line["speaker"]
+        last_start = line["start"]
+        # assumes speaker labels end in a two-digit index below
+        # len(name_speaker), e.g. "SPEAKER_00" - TODO confirm
+        index_sk = int(speaker[-2:])
+        character_sk = name_speaker[index_sk]
+        count_sk[character_sk] += 1
+        code_sk = character_sk+str(count_sk[character_sk])
+        text_data_dict.append({code_sk: text})
+        num_tokens += len(encoding.encode(text)) + 7
+        if num_tokens < token_batch_limit and i != last_index:
+            continue
+        # Close the current batch and reset the accumulators
+        batch_lines = len(text_data_dict)
+        batch_conversation = {"conversation": text_data_dict}
+        num_tokens = 0
+        text_data_dict = []
+        count_sk = {char: 0 for char in name_speaker}
+        try:
+            # Process translation
+            # https://arxiv.org/pdf/2309.03409.pdf
+            system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items."
+            user_prompt = f"Translate each of the following text values in conversation{' from' if lang_sc else ''} {lang_sc} to {lang_tg}:\n{batch_conversation}"
+            logger.debug(f"Prompt: {str(user_prompt)}")
+            conversation = call_gpt_translate(
+                client,
+                model,
+                system_prompt,
+                user_prompt,
+                original_text=batch_conversation,
+                batch_lines=batch_lines,
+            )
+            if len(conversation) < batch_lines:
+                raise ValueError(
+                    "Incomplete result received. Batch lines: "
+                    f"{len(conversation)}, expected: {batch_lines}"
+                )
+            # Collect the whole batch before storing it, so a malformed
+            # item cannot leave a partial batch behind for the fallback
+            # to duplicate; surplus lines are dropped
+            batch_translated = [
+                list(translated_text.values())[0]
+                for translated_text in conversation[:batch_lines]
+            ]
+            translated_lines.extend(batch_translated)
+            progress_bar.update(batch_lines)
+        except Exception as error:
+            logger.error(str(error))
+            first_start = segments_copy[max(0, i-(batch_lines-1))]["start"]
+            logger.warning(
+                f"The batch from {first_start} to {last_start} "
+                "failed, is being corrected with Google Translate"
+            )
+            translator = GoogleTranslator(
+                source=fixed_source,
+                target=fixed_target
+            )
+            for txt_source in batch_conversation["conversation"]:
+                translated_txt = translator.translate(
+                    list(txt_source.values())[0].strip()
+                )
+                translated_lines.append(translated_txt.strip())
+                progress_bar.update(1)
+    progress_bar.close()
+    return verify_translate(
+        segments, segments_copy, translated_lines, fixed_target, fixed_source
+    )
+def translate_text(
+    segments,
+    target,
+    translation_process="google_translator_batch",
+    chunk_size=4500,
+    source=None,
+    token_batch_limit=1000,
+):
+    """
+    Translate text segments with the backend named by `translation_process`.
+    Parameters:
+    - segments (list): Segment dictionaries handed to the backend.
+    - target (str): Target language.
+    - translation_process (str, optional): "google_translator_batch"
+        (default), "google_translator", "gpt-3.5-turbo-0125",
+        "gpt-4-turbo-preview", either GPT name followed by "_batch", or
+        "disable_translation".
+    - chunk_size (int, optional): Passed to `translate_batch()`.
+    - source (str, optional): Source language. Defaults to None.
+    - token_batch_limit (int, optional): Passed to `gpt_batch()`.
+    Returns:
+    - list: Whatever the backend returns; with "disable_translation" the
+        `segments` object itself.
+    Raises:
+    - ValueError: If `translation_process` is none of the names above.
+    """
+    sequential_models = ["gpt-3.5-turbo-0125", "gpt-4-turbo-preview"]
+    batch_models = ["gpt-3.5-turbo-0125_batch", "gpt-4-turbo-preview_batch"]
+    if translation_process == "google_translator_batch":
+        return translate_batch(
+            segments,
+            fix_code_language(target),
+            chunk_size,
+            fix_code_language(source)
+        )
+    if translation_process == "google_translator":
+        return translate_iterative(
+            segments,
+            fix_code_language(target),
+            fix_code_language(source)
+        )
+    if translation_process in sequential_models:
+        return gpt_sequential(segments, translation_process, target, source)
+    if translation_process in batch_models:
+        # The real model name is the option name without the suffix
+        model_name = translation_process.replace("_batch", "")
+        return gpt_batch(
+            segments, model_name, target, token_batch_limit, source
+        )
+    if translation_process == "disable_translation":
+        return segments
+    raise ValueError("No valid translation process")

soni_translate/utils.py CHANGED Viewed

@@ -1,487 +1,483 @@
-import os, zipfile, rarfile, shutil, subprocess, shlex, sys # noqa
-from .logging_setup import logger
-from urllib.parse import urlparse
-from IPython.utils import capture
-import re
-VIDEO_EXTENSIONS = [
-    ".mp4",
-    ".avi",
-    ".mov",
-    ".mkv",
-    ".wmv",
-    ".flv",
-    ".webm",
-    ".m4v",
-    ".mpeg",
-    ".mpg",
-    ".3gp"
-]
-AUDIO_EXTENSIONS = [
-    ".mp3",
-    ".wav",
-    ".aiff",
-    ".aif",
-    ".flac",
-    ".aac",
-    ".ogg",
-    ".wma",
-    ".m4a",
-    ".alac",
-    ".pcm",
-    ".opus",
-    ".ape",
-    ".amr",
-    ".ac3",
-    ".vox",
-    ".caf"
-]
-SUBTITLE_EXTENSIONS = [
-    ".srt",
-    ".vtt",
-    ".ass"
-]
-def run_command(command):
-    logger.debug(command)
-    if isinstance(command, str):
-        command = shlex.split(command)
-    sub_params = {
-        "stdout": subprocess.PIPE,
-        "stderr": subprocess.PIPE,
-        "creationflags": subprocess.CREATE_NO_WINDOW
-        if sys.platform == "win32"
-        else 0,
-    }
-    process_command = subprocess.Popen(command, **sub_params)
-    output, errors = process_command.communicate()
-    if (
-        process_command.returncode != 0
-    ):  # or not os.path.exists(mono_path) or os.path.getsize(mono_path) == 0:
-        logger.error("Error comnand")
-        raise Exception(errors.decode())
-def print_tree_directory(root_dir, indent=""):
-    if not os.path.exists(root_dir):
-        logger.error(f"{indent} Invalid directory or file: {root_dir}")
-        return
-    items = os.listdir(root_dir)
-    for index, item in enumerate(sorted(items)):
-        item_path = os.path.join(root_dir, item)
-        is_last_item = index == len(items) - 1
-        if os.path.isfile(item_path) and item_path.endswith(".zip"):
-            with zipfile.ZipFile(item_path, "r") as zip_file:
-                print(
-                    f"{indent}{'└──' if is_last_item else '├──'} {item} (zip file)"
-                )
-                zip_contents = zip_file.namelist()
-                for zip_item in sorted(zip_contents):
-                    print(
-                        f"{indent}{'    ' if is_last_item else '│   '}{zip_item}"
-                    )
-        else:
-            print(f"{indent}{'└──' if is_last_item else '├──'} {item}")
-            if os.path.isdir(item_path):
-                new_indent = indent + ("    " if is_last_item else "│   ")
-                print_tree_directory(item_path, new_indent)
-def upload_model_list():
-    weight_root = "weights"
-    models = []
-    for name in os.listdir(weight_root):
-        if name.endswith(".pth"):
-            models.append("weights/" + name)
-    if models:
-        logger.debug(models)
-    index_root = "logs"
-    index_paths = [None]
-    for name in os.listdir(index_root):
-        if name.endswith(".index"):
-            index_paths.append("logs/" + name)
-    if index_paths:
-        logger.debug(index_paths)
-    return models, index_paths
-def manual_download(url, dst):
-    if "drive.google" in url:
-        logger.info("Drive url")
-        if "folders" in url:
-            logger.info("folder")
-            os.system(f'gdown --folder "{url}" -O {dst} --fuzzy -c')
-        else:
-            logger.info("single")
-            os.system(f'gdown "{url}" -O {dst} --fuzzy -c')
-    elif "huggingface" in url:
-        logger.info("HuggingFace url")
-        if "/blob/" in url or "/resolve/" in url:
-            if "/blob/" in url:
-                url = url.replace("/blob/", "/resolve/")
-            download_manager(url=url, path=dst, overwrite=True, progress=True)
-        else:
-            os.system(f"git clone {url} {dst+'repo/'}")
-    elif "http" in url:
-        logger.info("URL")
-        download_manager(url=url, path=dst, overwrite=True, progress=True)
-    elif os.path.exists(url):
-        logger.info("Path")
-        copy_files(url, dst)
-    else:
-        logger.error(f"No valid URL: {url}")
-def download_list(text_downloads):
-    if os.environ.get("ZERO_GPU") == "TRUE":
-        raise RuntimeError("This option is disabled in this demo.")
-    try:
-        urls = [elem.strip() for elem in text_downloads.split(",")]
-    except Exception as error:
-        raise ValueError(f"No valid URL. {str(error)}")
-    create_directories(["downloads", "logs", "weights"])
-    path_download = "downloads/"
-    for url in urls:
-        manual_download(url, path_download)
-    # Tree
-    print("####################################")
-    print_tree_directory("downloads", indent="")
-    print("####################################")
-    # Place files
-    select_zip_and_rar_files("downloads/")
-    models, _ = upload_model_list()
-    # hf space models files delete
-    remove_directory_contents("downloads/repo")
-    return f"Downloaded = {models}"
-def select_zip_and_rar_files(directory_path="downloads/"):
-    # filter
-    zip_files = []
-    rar_files = []
-    for file_name in os.listdir(directory_path):
-        if file_name.endswith(".zip"):
-            zip_files.append(file_name)
-        elif file_name.endswith(".rar"):
-            rar_files.append(file_name)
-    # extract
-    for file_name in zip_files:
-        file_path = os.path.join(directory_path, file_name)
-        with zipfile.ZipFile(file_path, "r") as zip_ref:
-            zip_ref.extractall(directory_path)
-    for file_name in rar_files:
-        file_path = os.path.join(directory_path, file_name)
-        with rarfile.RarFile(file_path, "r") as rar_ref:
-            rar_ref.extractall(directory_path)
-    # set in path
-    def move_files_with_extension(src_dir, extension, destination_dir):
-        for root, _, files in os.walk(src_dir):
-            for file_name in files:
-                if file_name.endswith(extension):
-                    source_file = os.path.join(root, file_name)
-                    destination = os.path.join(destination_dir, file_name)
-                    shutil.move(source_file, destination)
-    move_files_with_extension(directory_path, ".index", "logs/")
-    move_files_with_extension(directory_path, ".pth", "weights/")
-    return "Download complete"
-def is_file_with_extensions(string_path, extensions):
-    return any(string_path.lower().endswith(ext) for ext in extensions)
-def is_video_file(string_path):
-    return is_file_with_extensions(string_path, VIDEO_EXTENSIONS)
-def is_audio_file(string_path):
-    return is_file_with_extensions(string_path, AUDIO_EXTENSIONS)
-def is_subtitle_file(string_path):
-    return is_file_with_extensions(string_path, SUBTITLE_EXTENSIONS)
-def get_directory_files(directory):
-    audio_files = []
-    video_files = []
-    sub_files = []
-    for item in os.listdir(directory):
-        item_path = os.path.join(directory, item)
-        if os.path.isfile(item_path):
-            if is_audio_file(item_path):
-                audio_files.append(item_path)
-            elif is_video_file(item_path):
-                video_files.append(item_path)
-            elif is_subtitle_file(item_path):
-                sub_files.append(item_path)
-    logger.info(
-        f"Files in path ({directory}): "
-        f"{str(audio_files + video_files + sub_files)}"
-    )
-    return audio_files, video_files, sub_files
-def get_valid_files(paths):
-    valid_paths = []
-    for path in paths:
-        if os.path.isdir(path):
-            audio_files, video_files, sub_files = get_directory_files(path)
-            valid_paths.extend(audio_files)
-            valid_paths.extend(video_files)
-            valid_paths.extend(sub_files)
-        else:
-            valid_paths.append(path)
-    return valid_paths
-def extract_video_links(link):
-    params_dlp = {"quiet": False, "no_warnings": True, "noplaylist": False}
-    try:
-        from yt_dlp import YoutubeDL
-        with capture.capture_output() as cap:
-            with YoutubeDL(params_dlp) as ydl:
-                info_dict = ydl.extract_info( # noqa
-                    link, download=False, process=True
-                )
-        urls = re.findall(r'\[youtube\] Extracting URL: (.*?)\n', cap.stdout)
-        logger.info(f"List of videos in ({link}): {str(urls)}")
-        del cap
-    except Exception as error:
-        logger.error(f"{link} >> {str(error)}")
-        urls = [link]
-    return urls
-def get_link_list(urls):
-    valid_links = []
-    for url_video in urls:
-        if "youtube.com" in url_video and "/watch?v=" not in url_video:
-            url_links = extract_video_links(url_video)
-            valid_links.extend(url_links)
-        else:
-            valid_links.append(url_video)
-    return valid_links
-# =====================================
-# Download Manager
-# =====================================
-def load_file_from_url(
-    url: str,
-    model_dir: str,
-    file_name: str | None = None,
-    overwrite: bool = False,
-    progress: bool = True,
-) -> str:
-    """Download a file from `url` into `model_dir`,
-    using the file present if possible.
-    Returns the path to the downloaded file.
-    """
-    os.makedirs(model_dir, exist_ok=True)
-    if not file_name:
-        parts = urlparse(url)
-        file_name = os.path.basename(parts.path)
-    cached_file = os.path.abspath(os.path.join(model_dir, file_name))
-    # Overwrite
-    if os.path.exists(cached_file):
-        if overwrite or os.path.getsize(cached_file) == 0:
-            remove_files(cached_file)
-    # Download
-    if not os.path.exists(cached_file):
-        logger.info(f'Downloading: "{url}" to {cached_file}\n')
-        from torch.hub import download_url_to_file
-        download_url_to_file(url, cached_file, progress=progress)
-    else:
-        logger.debug(cached_file)
-    return cached_file
-def friendly_name(file: str):
-    if file.startswith("http"):
-        file = urlparse(file).path
-    file = os.path.basename(file)
-    model_name, extension = os.path.splitext(file)
-    return model_name, extension
-def download_manager(
-    url: str,
-    path: str,
-    extension: str = "",
-    overwrite: bool = False,
-    progress: bool = True,
-):
-    url = url.strip()
-    name, ext = friendly_name(url)
-    name += ext if not extension else f".{extension}"
-    if url.startswith("http"):
-        filename = load_file_from_url(
-            url=url,
-            model_dir=path,
-            file_name=name,
-            overwrite=overwrite,
-            progress=progress,
-        )
-    else:
-        filename = path
-    return filename
-# =====================================
-# File management
-# =====================================
-# only remove files
-def remove_files(file_list):
-    if isinstance(file_list, str):
-        file_list = [file_list]
-    for file in file_list:
-        if os.path.exists(file):
-            os.remove(file)
-def remove_directory_contents(directory_path):
-    """
-    Removes all files and subdirectories within a directory.
-    Parameters:
-    directory_path (str): Path to the directory whose
-    contents need to be removed.
-    """
-    if os.path.exists(directory_path):
-        for filename in os.listdir(directory_path):
-            file_path = os.path.join(directory_path, filename)
-            try:
-                if os.path.isfile(file_path):
-                    os.remove(file_path)
-                elif os.path.isdir(file_path):
-                    shutil.rmtree(file_path)
-            except Exception as e:
-                logger.error(f"Failed to delete {file_path}. Reason: {e}")
-        logger.info(f"Content in '{directory_path}' removed.")
-    else:
-        logger.error(f"Directory '{directory_path}' does not exist.")
-# Create directory if not exists
-def create_directories(directory_path):
-    if isinstance(directory_path, str):
-        directory_path = [directory_path]
-    for one_dir_path in directory_path:
-        if not os.path.exists(one_dir_path):
-            os.makedirs(one_dir_path)
-            logger.debug(f"Directory '{one_dir_path}' created.")
-def move_files(source_dir, destination_dir, extension=""):
-    """
-    Moves file(s) from the source path to the destination path.
-    Parameters:
-    source_dir (str): Path to the source directory.
-    destination_dir (str): Path to the destination directory.
-    extension (str): Only move files with this extension.
-    """
-    create_directories(destination_dir)
-    for filename in os.listdir(source_dir):
-        source_path = os.path.join(source_dir, filename)
-        destination_path = os.path.join(destination_dir, filename)
-        if extension and not filename.endswith(extension):
-            continue
-        os.replace(source_path, destination_path)
-def copy_files(source_path, destination_path):
-    """
-    Copies a file or multiple files from a source path to a destination path.
-    Parameters:
-    source_path (str or list): Path or list of paths to the source
-    file(s) or directory.
-    destination_path (str): Path to the destination directory.
-    """
-    create_directories(destination_path)
-    if isinstance(source_path, str):
-        source_path = [source_path]
-    if os.path.isdir(source_path[0]):
-        # Copy all files from the source directory to the destination directory
-        base_path = source_path[0]
-        source_path = os.listdir(source_path[0])
-        source_path = [
-            os.path.join(base_path, file_name) for file_name in source_path
-        ]
-    for one_source_path in source_path:
-        if os.path.exists(one_source_path):
-            shutil.copy2(one_source_path, destination_path)
-            logger.debug(
-                f"File '{one_source_path}' copied to '{destination_path}'."
-            )
-        else:
-            logger.error(f"File '{one_source_path}' does not exist.")
-def rename_file(current_name, new_name):
-    file_directory = os.path.dirname(current_name)
-    if os.path.exists(current_name):
-        dir_new_name_file = os.path.join(file_directory, new_name)
-        os.rename(current_name, dir_new_name_file)
-        logger.debug(f"File '{current_name}' renamed to '{new_name}'.")
-        return dir_new_name_file
-    else:
-        logger.error(f"File '{current_name}' does not exist.")
-        return None

+import os, zipfile, rarfile, shutil, subprocess, shlex, sys # noqa
+from .logging_setup import logger
+from urllib.parse import urlparse
+from IPython.utils import capture
+import re
+# Lowercase suffixes, leading dot included, that is_video_file() accepts.
+VIDEO_EXTENSIONS = [
+    ".mp4",
+    ".avi",
+    ".mov",
+    ".mkv",
+    ".wmv",
+    ".flv",
+    ".webm",
+    ".m4v",
+    ".mpeg",
+    ".mpg",
+    ".3gp"
+]
+# Lowercase suffixes, leading dot included, that is_audio_file() accepts.
+AUDIO_EXTENSIONS = [
+    ".mp3",
+    ".wav",
+    ".aiff",
+    ".aif",
+    ".flac",
+    ".aac",
+    ".ogg",
+    ".wma",
+    ".m4a",
+    ".alac",
+    ".pcm",
+    ".opus",
+    ".ape",
+    ".amr",
+    ".ac3",
+    ".vox",
+    ".caf"
+]
+# Lowercase suffixes, leading dot included, that is_subtitle_file() accepts.
+SUBTITLE_EXTENSIONS = [
+    ".srt",
+    ".vtt",
+    ".ass"
+]
+def run_command(command):
+    """
+    Run an external command and raise if it exits with a non-zero status.
+    Parameters:
+    command (str or list): Command line; a string is split with
+    `shlex.split` before execution.
+    Raises:
+    Exception: With the decoded stderr of the command as message.
+    """
+    logger.debug(command)
+    arguments = shlex.split(command) if isinstance(command, str) else command
+    # Avoid opening a console window on Windows
+    creation_flags = 0
+    if sys.platform == "win32":
+        creation_flags = subprocess.CREATE_NO_WINDOW
+    process_command = subprocess.Popen(
+        arguments,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        creationflags=creation_flags,
+    )
+    _, errors = process_command.communicate()
+    if process_command.returncode == 0:
+        return
+    logger.error("Error comnand")
+    raise Exception(errors.decode())
+def print_tree_directory(root_dir, indent=""):
+    """
+    Print the contents of a directory as a tree, recursing into
+    subdirectories and listing the members of ".zip" files.
+    Parameters:
+    root_dir (str): Directory to list.
+    indent (str): Prefix printed before every line of this level.
+    """
+    if not os.path.exists(root_dir):
+        logger.error(f"{indent} Invalid directory or file: {root_dir}")
+        return
+    entries = sorted(os.listdir(root_dir))
+    final_position = len(entries) - 1
+    for position, entry in enumerate(entries):
+        entry_path = os.path.join(root_dir, entry)
+        is_final = position == final_position
+        branch = '└──' if is_final else '├──'
+        nested_indent = indent + ('    ' if is_final else '│   ')
+        if os.path.isfile(entry_path) and entry_path.endswith(".zip"):
+            with zipfile.ZipFile(entry_path, "r") as zip_file:
+                print(f"{indent}{branch} {entry} (zip file)")
+                for zip_item in sorted(zip_file.namelist()):
+                    print(f"{nested_indent}{zip_item}")
+            continue
+        print(f"{indent}{branch} {entry}")
+        if os.path.isdir(entry_path):
+            print_tree_directory(entry_path, nested_indent)
+def upload_model_list():
+    """
+    List the ".pth" files in "weights" and the ".index" files in "logs".
+    Returns:
+    tuple: (models, index_paths); both hold paths prefixed with their
+    directory name, and index_paths always starts with None.
+    """
+    models = [
+        "weights/" + name
+        for name in os.listdir("weights")
+        if name.endswith(".pth")
+    ]
+    if models:
+        logger.debug(models)
+    index_paths = [None] + [
+        "logs/" + name
+        for name in os.listdir("logs")
+        if name.endswith(".index")
+    ]
+    # Never empty because of the leading None, so this is always logged
+    logger.debug(index_paths)
+    return models, index_paths
+def _run_download_command(arguments):
+    """Run a download tool without a shell; log failures, never raise."""
+    try:
+        completed = subprocess.run(arguments, check=False)
+    except OSError as error:
+        logger.error(f"Download command failed ({error}): {arguments}")
+        return
+    if completed.returncode != 0:
+        logger.error(
+            f"Download command exited with {completed.returncode}: "
+            f"{arguments}"
+        )
+def manual_download(url, dst):
+    """
+    Fetch `url` into `dst`, choosing the tool from the text of the URL.
+    Parameters:
+    url (str): Google Drive link, Hugging Face link, other "http" link, or
+    an existing local path.
+    dst (str): Destination directory; repositories are cloned into
+    `dst + 'repo/'`.
+    Notes:
+    External tools (gdown, git) receive an argument list instead of a
+    shell string, so characters in a user-supplied URL are never
+    interpreted by a shell. A failing tool is logged, not raised.
+    """
+    if "drive.google" in url:
+        logger.info("Drive url")
+        if "folders" in url:
+            logger.info("folder")
+            _run_download_command(
+                ["gdown", "--folder", url, "-O", dst, "--fuzzy", "-c"]
+            )
+        else:
+            logger.info("single")
+            _run_download_command(
+                ["gdown", url, "-O", dst, "--fuzzy", "-c"]
+            )
+    elif "huggingface" in url:
+        logger.info("HuggingFace url")
+        if "/blob/" in url or "/resolve/" in url:
+            if "/blob/" in url:
+                # "/resolve/" serves the raw file instead of the web page
+                url = url.replace("/blob/", "/resolve/")
+            download_manager(url=url, path=dst, overwrite=True, progress=True)
+        else:
+            # "--" stops a URL starting with "-" from being read as an option
+            _run_download_command(["git", "clone", "--", url, dst+'repo/'])
+    elif "http" in url:
+        logger.info("URL")
+        download_manager(url=url, path=dst, overwrite=True, progress=True)
+    elif os.path.exists(url):
+        logger.info("Path")
+        copy_files(url, dst)
+    else:
+        logger.error(f"No valid URL: {url}")
+def download_list(text_downloads):
+    """
+    Download every entry of a comma-separated list and put the model files
+    in place.
+    Parameters:
+    text_downloads (str): URLs or paths separated by commas.
+    Returns:
+    str: "Downloaded = " followed by the models found in "weights".
+    Raises:
+    ValueError: If `text_downloads` cannot be split into entries.
+    """
+    try:
+        urls = [entry.strip() for entry in text_downloads.split(",")]
+    except Exception as error:
+        raise ValueError(f"No valid URL. {str(error)}")
+    create_directories(["downloads", "logs", "weights"])
+    for url in urls:
+        manual_download(url, "downloads/")
+    # Tree
+    banner = "####################################"
+    print(banner)
+    print_tree_directory("downloads", indent="")
+    print(banner)
+    # Place files
+    select_zip_and_rar_files("downloads/")
+    models = upload_model_list()[0]
+    # hf space models files delete
+    remove_directory_contents("downloads/repo")
+    return f"Downloaded = {models}"
+def select_zip_and_rar_files(directory_path="downloads/"):
+    """
+    Extract the ".zip" and ".rar" archives found directly in
+    `directory_path`, then move every ".index" file under it to "logs/" and
+    every ".pth" file to "weights/".
+    Parameters:
+    directory_path (str): Directory holding the downloads.
+    Returns:
+    str: "Download complete".
+    """
+    # filter
+    listing = os.listdir(directory_path)
+    zip_files = [name for name in listing if name.endswith(".zip")]
+    rar_files = [name for name in listing if name.endswith(".rar")]
+    # extract, zip archives first
+    archive_groups = (
+        (zipfile.ZipFile, zip_files),
+        (rarfile.RarFile, rar_files),
+    )
+    for open_archive, archive_names in archive_groups:
+        for archive_name in archive_names:
+            archive_path = os.path.join(directory_path, archive_name)
+            with open_archive(archive_path, "r") as archive_ref:
+                archive_ref.extractall(directory_path)
+    # set in path
+    for extension, destination_dir in ((".index", "logs/"), (".pth", "weights/")):
+        for root, _, files in os.walk(directory_path):
+            for file_name in files:
+                if not file_name.endswith(extension):
+                    continue
+                shutil.move(
+                    os.path.join(root, file_name),
+                    os.path.join(destination_dir, file_name),
+                )
+    return "Download complete"
+def is_file_with_extensions(string_path, extensions):
+    """Return True if `string_path`, lowercased, ends with any of `extensions`."""
+    lowered_path = string_path.lower()
+    for extension in extensions:
+        if lowered_path.endswith(extension):
+            return True
+    return False
+def is_video_file(string_path):
+    """Return True if `string_path` ends with one of VIDEO_EXTENSIONS."""
+    return is_file_with_extensions(string_path, extensions=VIDEO_EXTENSIONS)
+def is_audio_file(string_path):
+    """Tell whether the path ends with one of AUDIO_EXTENSIONS."""
+    return is_file_with_extensions(
+        string_path=string_path, extensions=AUDIO_EXTENSIONS
+    )
+def is_subtitle_file(string_path):
+    """Tell whether the path ends with one of SUBTITLE_EXTENSIONS."""
+    return is_file_with_extensions(
+        string_path=string_path, extensions=SUBTITLE_EXTENSIONS
+    )
+def get_directory_files(directory):
+    """Sort the files directly inside a directory by media category.
+    Only regular files at the top level of `directory` are considered.
+    Parameters:
+    directory (str): Directory to list.
+    Returns:
+    tuple: Three lists of paths (audio, video, subtitles).
+    """
+    audio_files, video_files, sub_files = [], [], []
+    # Checked in this order; the first matching category wins.
+    categories = (
+        (is_audio_file, audio_files),
+        (is_video_file, video_files),
+        (is_subtitle_file, sub_files),
+    )
+    for entry in os.listdir(directory):
+        entry_path = os.path.join(directory, entry)
+        if not os.path.isfile(entry_path):
+            continue
+        for matches, bucket in categories:
+            if matches(entry_path):
+                bucket.append(entry_path)
+                break
+    logger.info(
+        f"Files in path ({directory}): "
+        f"{str(audio_files + video_files + sub_files)}"
+    )
+    return audio_files, video_files, sub_files
+def get_valid_files(paths):
+    """Flatten a mix of file and directory paths into one list.
+    A directory is replaced by the media files found directly inside it
+    (audio, then video, then subtitles); any other path is kept as is.
+    """
+    valid_paths = []
+    for path in paths:
+        if not os.path.isdir(path):
+            valid_paths.append(path)
+            continue
+        # Groups arrive as (audio, video, subtitles) and keep that order.
+        for group in get_directory_files(path):
+            valid_paths.extend(group)
+    return valid_paths
+def extract_video_links(link):
+    """List the video URLs that yt-dlp reports for a link.
+    yt-dlp is run without downloading while its output is captured; the
+    URLs are read from the captured stdout lines of the form
+    "[youtube] Extracting URL: <url>". If anything fails, the error is
+    logged and the original link is returned as the only item.
+    Parameters:
+    link (str): URL handed to yt-dlp.
+    Returns:
+    list: The URLs found, or ``[link]`` after an error.
+    """
+    ydl_options = {"quiet": False, "no_warnings": True, "noplaylist": False}
+    try:
+        from yt_dlp import YoutubeDL
+        with capture.capture_output() as captured:
+            with YoutubeDL(ydl_options) as ydl:
+                # Only the printed output matters; the result is unused.
+                ydl.extract_info(link, download=False, process=True)
+        urls = re.findall(
+            r'\[youtube\] Extracting URL: (.*?)\n', captured.stdout
+        )
+        logger.info(f"List of videos in ({link}): {str(urls)}")
+        return urls
+    except Exception as error:
+        logger.error(f"{link} >> {str(error)}")
+        return [link]
+def get_link_list(urls):
+    """Expand YouTube links that are not single-video links.
+    A URL containing "youtube.com" but not "/watch?v=" is passed to
+    extract_video_links and replaced by what it returns; every other URL
+    is kept unchanged.
+    """
+    valid_links = []
+    for url_video in urls:
+        keep_as_is = (
+            "youtube.com" not in url_video or "/watch?v=" in url_video
+        )
+        if keep_as_is:
+            valid_links.append(url_video)
+        else:
+            valid_links.extend(extract_video_links(url_video))
+    return valid_links
+# =====================================
+# Download Manager
+# =====================================
+def load_file_from_url(
+    url: str,
+    model_dir: str,
+    file_name: str | None = None,
+    overwrite: bool = False,
+    progress: bool = True,
+) -> str:
+    """Fetch `url` into `model_dir`, reusing an existing copy if present.
+    When `file_name` is not given it is taken from the last component of
+    the URL path. An existing file is deleted first when `overwrite` is
+    set or when it is zero bytes long.
+    Returns the absolute path to the file.
+    """
+    os.makedirs(model_dir, exist_ok=True)
+    if not file_name:
+        file_name = os.path.basename(urlparse(url).path)
+    cached_file = os.path.abspath(os.path.join(model_dir, file_name))
+    # A forced refresh or a zero-byte file invalidates the local copy.
+    stale = os.path.exists(cached_file) and (
+        overwrite or os.path.getsize(cached_file) == 0
+    )
+    if stale:
+        remove_files(cached_file)
+    if os.path.exists(cached_file):
+        logger.debug(cached_file)
+        return cached_file
+    logger.info(f'Downloading: "{url}" to {cached_file}\n')
+    from torch.hub import download_url_to_file
+    download_url_to_file(url, cached_file, progress=progress)
+    return cached_file
+def friendly_name(file: str):
+    """Split a path or URL into (base name without extension, extension).
+    For input starting with "http" only the path component of the URL is
+    used, so a query string does not end up in the extension.
+    """
+    location = urlparse(file).path if file.startswith("http") else file
+    return os.path.splitext(os.path.basename(location))
+def download_manager(
+    url: str,
+    path: str,
+    extension: str = "",
+    overwrite: bool = False,
+    progress: bool = True,
+):
+    """Download `url` into the directory `path` and return the file path.
+    The local file name is derived from the URL; when `extension` is
+    given it replaces the extension found in the URL. If `url` does not
+    start with "http", nothing is downloaded and `path` is returned
+    unchanged.
+    """
+    url = url.strip()
+    name, ext = friendly_name(url)
+    suffix = f".{extension}" if extension else ext
+    name = name + suffix
+    if not url.startswith("http"):
+        return path
+    return load_file_from_url(
+        url=url,
+        model_dir=path,
+        file_name=name,
+        overwrite=overwrite,
+        progress=progress,
+    )
+# =====================================
+# File management
+# =====================================
+# only remove files
+def remove_files(file_list):
+    """Delete one file or a list of files, skipping missing paths.
+    Parameters:
+    file_list (str or list): Path or list of paths to delete.
+    """
+    targets = [file_list] if isinstance(file_list, str) else file_list
+    for target in targets:
+        if not os.path.exists(target):
+            continue
+        os.remove(target)
+def remove_directory_contents(directory_path):
+    """
+    Removes all files, symbolic links and subdirectories within a
+    directory, leaving the directory itself in place.
+    A failure on one entry is logged and the remaining entries are still
+    processed.
+    Parameters:
+    directory_path (str): Path to the directory whose
+    contents need to be removed.
+    """
+    if not os.path.exists(directory_path):
+        logger.error(f"Directory '{directory_path}' does not exist.")
+        return
+    for filename in os.listdir(directory_path):
+        file_path = os.path.join(directory_path, filename)
+        try:
+            # Links are tested first: rmtree refuses to operate on a
+            # symlink, and a dangling link is neither a file nor a
+            # directory, so both kinds used to be left behind.
+            if os.path.islink(file_path) or os.path.isfile(file_path):
+                os.remove(file_path)
+            elif os.path.isdir(file_path):
+                shutil.rmtree(file_path)
+        except Exception as e:
+            logger.error(f"Failed to delete {file_path}. Reason: {e}")
+    logger.info(f"Content in '{directory_path}' removed.")
+# Create directory if not exists
+def create_directories(directory_path):
+    """Create each given directory unless the path already exists.
+    Parameters:
+    directory_path (str or list): Directory path or list of paths.
+    """
+    if isinstance(directory_path, str):
+        wanted = [directory_path]
+    else:
+        wanted = directory_path
+    for one_dir_path in wanted:
+        if os.path.exists(one_dir_path):
+            continue
+        os.makedirs(one_dir_path)
+        logger.debug(f"Directory '{one_dir_path}' created.")
+def move_files(source_dir, destination_dir, extension=""):
+    """
+    Moves every entry of the source directory into the destination
+    directory, creating the destination when needed. An entry with the
+    same name in the destination is replaced.
+    Parameters:
+    source_dir (str): Path to the source directory.
+    destination_dir (str): Path to the destination directory.
+    extension (str): Only move entries whose name ends with this text;
+    an empty value moves everything.
+    """
+    create_directories(destination_dir)
+    for filename in os.listdir(source_dir):
+        if not extension or filename.endswith(extension):
+            os.replace(
+                os.path.join(source_dir, filename),
+                os.path.join(destination_dir, filename),
+            )
+def copy_files(source_path, destination_path):
+    """
+    Copies a file or multiple files from a source path to a destination path.
+    The destination directory is created when missing. If the first
+    source is a directory, the regular files directly inside it are
+    copied instead of the given list; subdirectories are not copied.
+    Source paths that do not exist are logged and skipped.
+    Parameters:
+    source_path (str or list): Path or list of paths to the source
+    file(s) or directory.
+    destination_path (str): Path to the destination directory.
+    """
+    create_directories(destination_path)
+    if isinstance(source_path, str):
+        source_path = [source_path]
+    if not source_path:
+        # Nothing to copy; indexing the first item would raise IndexError.
+        return
+    if os.path.isdir(source_path[0]):
+        # Copy all files from the source directory to the destination directory
+        base_path = source_path[0]
+        candidates = [
+            os.path.join(base_path, file_name)
+            for file_name in os.listdir(base_path)
+        ]
+        # shutil.copy2 cannot copy a directory, so nested folders are
+        # left out instead of aborting the whole copy.
+        source_path = [
+            candidate for candidate in candidates
+            if os.path.isfile(candidate)
+        ]
+    for one_source_path in source_path:
+        if os.path.exists(one_source_path):
+            shutil.copy2(one_source_path, destination_path)
+            logger.debug(
+                f"File '{one_source_path}' copied to '{destination_path}'."
+            )
+        else:
+            logger.error(f"File '{one_source_path}' does not exist.")
+def rename_file(current_name, new_name):
+    """Rename a file inside its own directory.
+    Parameters:
+    current_name (str): Path to the existing file.
+    new_name (str): New file name, joined to the directory of
+    `current_name`.
+    Returns:
+    str or None: Path of the renamed file, or None when `current_name`
+    does not exist.
+    """
+    if not os.path.exists(current_name):
+        logger.error(f"File '{current_name}' does not exist.")
+        return None
+    renamed_path = os.path.join(os.path.dirname(current_name), new_name)
+    os.rename(current_name, renamed_path)
+    logger.debug(f"File '{current_name}' renamed to '{new_name}'.")
+    return renamed_path

voice_main.py CHANGED Viewed

@@ -1,732 +1,732 @@
-from soni_translate.logging_setup import logger
-import torch
-import gc
-import numpy as np
-import os
-import shutil
-import warnings
-import threading
-from tqdm import tqdm
-from lib.infer_pack.models import (
-    SynthesizerTrnMs256NSFsid,
-    SynthesizerTrnMs256NSFsid_nono,
-    SynthesizerTrnMs768NSFsid,
-    SynthesizerTrnMs768NSFsid_nono,
-)
-from lib.audio import load_audio
-import soundfile as sf
-import edge_tts
-import asyncio
-from soni_translate.utils import remove_directory_contents, create_directories
-from scipy import signal
-from time import time as ttime
-import faiss
-from vci_pipeline import VC, change_rms, bh, ah
-import librosa
-warnings.filterwarnings("ignore")
-class Config:
-    def __init__(self, only_cpu=False):
-        self.device = "cuda:0"
-        self.is_half = True
-        self.n_cpu = 0
-        self.gpu_name = None
-        self.gpu_mem = None
-        (
-            self.x_pad,
-            self.x_query,
-            self.x_center,
-            self.x_max
-        ) = self.device_config(only_cpu)
-    def device_config(self, only_cpu) -> tuple:
-        if torch.cuda.is_available() and not only_cpu:
-            i_device = int(self.device.split(":")[-1])
-            self.gpu_name = torch.cuda.get_device_name(i_device)
-            if (
-                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
-                or "P40" in self.gpu_name.upper()
-                or "1060" in self.gpu_name
-                or "1070" in self.gpu_name
-                or "1080" in self.gpu_name
-            ):
-                logger.info(
-                    "16/10 Series GPUs and P40 excel "
-                    "in single-precision tasks."
-                )
-                self.is_half = False
-            else:
-                self.gpu_name = None
-            self.gpu_mem = int(
-                torch.cuda.get_device_properties(i_device).total_memory
-                / 1024
-                / 1024
-                / 1024
-                + 0.4
-            )
-        elif torch.backends.mps.is_available() and not only_cpu:
-            logger.info("Supported N-card not found, using MPS for inference")
-            self.device = "mps"
-        else:
-            logger.info("No supported N-card found, using CPU for inference")
-            self.device = "cpu"
-            self.is_half = False
-        if self.n_cpu == 0:
-            self.n_cpu = os.cpu_count()
-        if self.is_half:
-            # 6GB VRAM configuration
-            x_pad = 3
-            x_query = 10
-            x_center = 60
-            x_max = 65
-        else:
-            # 5GB VRAM configuration
-            x_pad = 1
-            x_query = 6
-            x_center = 38
-            x_max = 41
-        if self.gpu_mem is not None and self.gpu_mem <= 4:
-            x_pad = 1
-            x_query = 5
-            x_center = 30
-            x_max = 32
-        logger.info(
-            f"Config: Device is {self.device}, "
-            f"half precision is {self.is_half}"
-        )
-        return x_pad, x_query, x_center, x_max
-BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/"
-BASE_MODELS = [
-    "hubert_base.pt",
-    "rmvpe.pt"
-]
-BASE_DIR = "."
-def load_hu_bert(config):
-    from fairseq import checkpoint_utils
-    from soni_translate.utils import download_manager
-    for id_model in BASE_MODELS:
-        download_manager(
-            os.path.join(BASE_DOWNLOAD_LINK, id_model), BASE_DIR
-        )
-    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
-        ["hubert_base.pt"],
-        suffix="",
-    )
-    hubert_model = models[0]
-    hubert_model = hubert_model.to(config.device)
-    if config.is_half:
-        hubert_model = hubert_model.half()
-    else:
-        hubert_model = hubert_model.float()
-    hubert_model.eval()
-    return hubert_model
-def load_trained_model(model_path, config):
-    if not model_path:
-        raise ValueError("No model found")
-    logger.info("Loading %s" % model_path)
-    cpt = torch.load(model_path, map_location="cpu")
-    tgt_sr = cpt["config"][-1]
-    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
-    if_f0 = cpt.get("f0", 1)
-    if if_f0 == 0:
-        # protect to 0.5 need?
-        pass
-    version = cpt.get("version", "v1")
-    if version == "v1":
-        if if_f0 == 1:
-            net_g = SynthesizerTrnMs256NSFsid(
-                *cpt["config"], is_half=config.is_half
-            )
-        else:
-            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-    elif version == "v2":
-        if if_f0 == 1:
-            net_g = SynthesizerTrnMs768NSFsid(
-                *cpt["config"], is_half=config.is_half
-            )
-        else:
-            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
-    del net_g.enc_q
-    net_g.load_state_dict(cpt["weight"], strict=False)
-    net_g.eval().to(config.device)
-    if config.is_half:
-        net_g = net_g.half()
-    else:
-        net_g = net_g.float()
-    vc = VC(tgt_sr, config)
-    n_spk = cpt["config"][-3]
-    return n_spk, tgt_sr, net_g, vc, cpt, version
-class ClassVoices:
-    def __init__(self, only_cpu=False):
-        self.model_config = {}
-        self.config = None
-        self.only_cpu = only_cpu
-    def apply_conf(
-        self,
-        tag="base_model",
-        file_model="",
-        pitch_algo="pm",
-        pitch_lvl=0,
-        file_index="",
-        index_influence=0.66,
-        respiration_median_filtering=3,
-        envelope_ratio=0.25,
-        consonant_breath_protection=0.33,
-        resample_sr=0,
-        file_pitch_algo="",
-    ):
-        if not file_model:
-            raise ValueError("Model not found")
-        if file_index is None:
-            file_index = ""
-        if file_pitch_algo is None:
-            file_pitch_algo = ""
-        if not self.config:
-            self.config = Config(self.only_cpu)
-            self.hu_bert_model = None
-            self.model_pitch_estimator = None
-        self.model_config[tag] = {
-            "file_model": file_model,
-            "pitch_algo": pitch_algo,
-            "pitch_lvl": pitch_lvl,  # no decimal
-            "file_index": file_index,
-            "index_influence": index_influence,
-            "respiration_median_filtering": respiration_median_filtering,
-            "envelope_ratio": envelope_ratio,
-            "consonant_breath_protection": consonant_breath_protection,
-            "resample_sr": resample_sr,
-            "file_pitch_algo": file_pitch_algo,
-        }
-        return f"CONFIGURATION APPLIED FOR {tag}: {file_model}"
-    def infer(
-        self,
-        task_id,
-        params,
-        # load model
-        n_spk,
-        tgt_sr,
-        net_g,
-        pipe,
-        cpt,
-        version,
-        if_f0,
-        # load index
-        index_rate,
-        index,
-        big_npy,
-        # load f0 file
-        inp_f0,
-        # audio file
-        input_audio_path,
-        overwrite,
-    ):
-        f0_method = params["pitch_algo"]
-        f0_up_key = params["pitch_lvl"]
-        filter_radius = params["respiration_median_filtering"]
-        resample_sr = params["resample_sr"]
-        rms_mix_rate = params["envelope_ratio"]
-        protect = params["consonant_breath_protection"]
-        if not os.path.exists(input_audio_path):
-            raise ValueError(
-                "The audio file was not found or is not "
-                f"a valid file: {input_audio_path}"
-            )
-        f0_up_key = int(f0_up_key)
-        audio = load_audio(input_audio_path, 16000)
-        # Normalize audio
-        audio_max = np.abs(audio).max() / 0.95
-        if audio_max > 1:
-            audio /= audio_max
-        times = [0, 0, 0]
-        # filters audio signal, pads it, computes sliding window sums,
-        # and extracts optimized time indices
-        audio = signal.filtfilt(bh, ah, audio)
-        audio_pad = np.pad(
-            audio, (pipe.window // 2, pipe.window // 2), mode="reflect"
-        )
-        opt_ts = []
-        if audio_pad.shape[0] > pipe.t_max:
-            audio_sum = np.zeros_like(audio)
-            for i in range(pipe.window):
-                audio_sum += audio_pad[i:i - pipe.window]
-            for t in range(pipe.t_center, audio.shape[0], pipe.t_center):
-                opt_ts.append(
-                    t
-                    - pipe.t_query
-                    + np.where(
-                        np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query])
-                        == np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query]).min()
-                    )[0][0]
-                )
-        s = 0
-        audio_opt = []
-        t = None
-        t1 = ttime()
-        sid_value = 0
-        sid = torch.tensor(sid_value, device=pipe.device).unsqueeze(0).long()
-        # Pads audio symmetrically, calculates length divided by window size.
-        audio_pad = np.pad(audio, (pipe.t_pad, pipe.t_pad), mode="reflect")
-        p_len = audio_pad.shape[0] // pipe.window
-        # Estimates pitch from audio signal
-        pitch, pitchf = None, None
-        if if_f0 == 1:
-            pitch, pitchf = pipe.get_f0(
-                input_audio_path,
-                audio_pad,
-                p_len,
-                f0_up_key,
-                f0_method,
-                filter_radius,
-                inp_f0,
-            )
-            pitch = pitch[:p_len]
-            pitchf = pitchf[:p_len]
-            if pipe.device == "mps":
-                pitchf = pitchf.astype(np.float32)
-            pitch = torch.tensor(
-                pitch, device=pipe.device
-            ).unsqueeze(0).long()
-            pitchf = torch.tensor(
-                pitchf, device=pipe.device
-            ).unsqueeze(0).float()
-        t2 = ttime()
-        times[1] += t2 - t1
-        for t in opt_ts:
-            t = t // pipe.window * pipe.window
-            if if_f0 == 1:
-                pitch_slice = pitch[
-                    :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
-                ]
-                pitchf_slice = pitchf[
-                    :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
-                ]
-            else:
-                pitch_slice = None
-                pitchf_slice = None
-            audio_slice = audio_pad[s:t + pipe.t_pad2 + pipe.window]
-            audio_opt.append(
-                pipe.vc(
-                    self.hu_bert_model,
-                    net_g,
-                    sid,
-                    audio_slice,
-                    pitch_slice,
-                    pitchf_slice,
-                    times,
-                    index,
-                    big_npy,
-                    index_rate,
-                    version,
-                    protect,
-                )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
-            )
-            s = t
-        pitch_end_slice = pitch[
-            :, t // pipe.window:
-        ] if t is not None else pitch
-        pitchf_end_slice = pitchf[
-            :, t // pipe.window:
-        ] if t is not None else pitchf
-        audio_opt.append(
-            pipe.vc(
-                self.hu_bert_model,
-                net_g,
-                sid,
-                audio_pad[t:],
-                pitch_end_slice,
-                pitchf_end_slice,
-                times,
-                index,
-                big_npy,
-                index_rate,
-                version,
-                protect,
-            )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
-        )
-        audio_opt = np.concatenate(audio_opt)
-        if rms_mix_rate != 1:
-            audio_opt = change_rms(
-                audio, 16000, audio_opt, tgt_sr, rms_mix_rate
-            )
-        if resample_sr >= 16000 and tgt_sr != resample_sr:
-            audio_opt = librosa.resample(
-                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
-            )
-        audio_max = np.abs(audio_opt).max() / 0.99
-        max_int16 = 32768
-        if audio_max > 1:
-            max_int16 /= audio_max
-        audio_opt = (audio_opt * max_int16).astype(np.int16)
-        del pitch, pitchf, sid
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        if tgt_sr != resample_sr >= 16000:
-            final_sr = resample_sr
-        else:
-            final_sr = tgt_sr
-        """
-        "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
-            times[0],
-            times[1],
-            times[2],
-        ), (final_sr, audio_opt)
-        """
-        if overwrite:
-            output_audio_path = input_audio_path  # Overwrite
-        else:
-            basename = os.path.basename(input_audio_path)
-            dirname = os.path.dirname(input_audio_path)
-            new_basename = basename.split(
-                '.')[0] + "_edited." + basename.split('.')[-1]
-            new_path = os.path.join(dirname, new_basename)
-            logger.info(str(new_path))
-            output_audio_path = new_path
-        # Save file
-        sf.write(
-            file=output_audio_path,
-            samplerate=final_sr,
-            data=audio_opt
-        )
-        self.model_config[task_id]["result"].append(output_audio_path)
-        self.output_list.append(output_audio_path)
-    def make_test(
-        self,
-        tts_text,
-        tts_voice,
-        model_path,
-        index_path,
-        transpose,
-        f0_method,
-    ):
-        folder_test = "test"
-        tag = "test_edge"
-        tts_file = "test/test.wav"
-        tts_edited = "test/test_edited.wav"
-        create_directories(folder_test)
-        remove_directory_contents(folder_test)
-        if "SET_LIMIT" == os.getenv("DEMO"):
-            if len(tts_text) > 60:
-                tts_text = tts_text[:60]
-                logger.warning("DEMO; limit to 60 characters")
-        try:
-            asyncio.run(edge_tts.Communicate(
-                tts_text, "-".join(tts_voice.split('-')[:-1])
-            ).save(tts_file))
-        except Exception as e:
-            raise ValueError(
-                "No audio was received. Please change the "
-                f"tts voice for {tts_voice}. Error: {str(e)}"
-            )
-        shutil.copy(tts_file, tts_edited)
-        self.apply_conf(
-            tag=tag,
-            file_model=model_path,
-            pitch_algo=f0_method,
-            pitch_lvl=transpose,
-            file_index=index_path,
-            index_influence=0.66,
-            respiration_median_filtering=3,
-            envelope_ratio=0.25,
-            consonant_breath_protection=0.33,
-        )
-        self(
-            audio_files=tts_edited,
-            tag_list=tag,
-            overwrite=True
-        )
-        return tts_edited, tts_file
-    def run_threads(self, threads):
-        # Start threads
-        for thread in threads:
-            thread.start()
-        # Wait for all threads to finish
-        for thread in threads:
-            thread.join()
-        gc.collect()
-        torch.cuda.empty_cache()
-    def unload_models(self):
-        self.hu_bert_model = None
-        self.model_pitch_estimator = None
-        gc.collect()
-        torch.cuda.empty_cache()
-    def __call__(
-        self,
-        audio_files=[],
-        tag_list=[],
-        overwrite=False,
-        parallel_workers=1,
-    ):
-        logger.info(f"Parallel workers: {str(parallel_workers)}")
-        self.output_list = []
-        if not self.model_config:
-            raise ValueError("No model has been configured for inference")
-        if isinstance(audio_files, str):
-            audio_files = [audio_files]
-        if isinstance(tag_list, str):
-            tag_list = [tag_list]
-        if not audio_files:
-            raise ValueError("No audio found to convert")
-        if not tag_list:
-            tag_list = [list(self.model_config.keys())[-1]] * len(audio_files)
-        if len(audio_files) > len(tag_list):
-            logger.info("Extend tag list to match audio files")
-            extend_number = len(audio_files) - len(tag_list)
-            tag_list.extend([tag_list[0]] * extend_number)
-        if len(audio_files) < len(tag_list):
-            logger.info("Cut list tags")
-            tag_list = tag_list[:len(audio_files)]
-        tag_file_pairs = list(zip(tag_list, audio_files))
-        sorted_tag_file = sorted(tag_file_pairs, key=lambda x: x[0])
-        # Base params
-        if not self.hu_bert_model:
-            self.hu_bert_model = load_hu_bert(self.config)
-        cache_params = None
-        threads = []
-        progress_bar = tqdm(total=len(tag_list), desc="Progress")
-        for i, (id_tag, input_audio_path) in enumerate(sorted_tag_file):
-            if id_tag not in self.model_config.keys():
-                logger.info(
-                    f"No configured model for {id_tag} with {input_audio_path}"
-                )
-                continue
-            if (
-                len(threads) >= parallel_workers
-                or cache_params != id_tag
-                and cache_params is not None
-            ):
-                self.run_threads(threads)
-                progress_bar.update(len(threads))
-                threads = []
-            if cache_params != id_tag:
-                self.model_config[id_tag]["result"] = []
-                # Unload previous
-                (
-                    n_spk,
-                    tgt_sr,
-                    net_g,
-                    pipe,
-                    cpt,
-                    version,
-                    if_f0,
-                    index_rate,
-                    index,
-                    big_npy,
-                    inp_f0,
-                ) = [None] * 11
-                gc.collect()
-                torch.cuda.empty_cache()
-                # Model params
-                params = self.model_config[id_tag]
-                model_path = params["file_model"]
-                f0_method = params["pitch_algo"]
-                file_index = params["file_index"]
-                index_rate = params["index_influence"]
-                f0_file = params["file_pitch_algo"]
-                # Load model
-                (
-                    n_spk,
-                    tgt_sr,
-                    net_g,
-                    pipe,
-                    cpt,
-                    version
-                ) = load_trained_model(model_path, self.config)
-                if_f0 = cpt.get("f0", 1)  # pitch data
-                # Load index
-                if os.path.exists(file_index) and index_rate != 0:
-                    try:
-                        index = faiss.read_index(file_index)
-                        big_npy = index.reconstruct_n(0, index.ntotal)
-                    except Exception as error:
-                        logger.error(f"Index: {str(error)}")
-                        index_rate = 0
-                        index = big_npy = None
-                else:
-                    logger.warning("File index not found")
-                    index_rate = 0
-                    index = big_npy = None
-                # Load f0 file
-                inp_f0 = None
-                if os.path.exists(f0_file):
-                    try:
-                        with open(f0_file, "r") as f:
-                            lines = f.read().strip("\n").split("\n")
-                        inp_f0 = []
-                        for line in lines:
-                            inp_f0.append([float(i) for i in line.split(",")])
-                        inp_f0 = np.array(inp_f0, dtype="float32")
-                    except Exception as error:
-                        logger.error(f"f0 file: {str(error)}")
-                if "rmvpe" in f0_method:
-                    if not self.model_pitch_estimator:
-                        from lib.rmvpe import RMVPE
-                        logger.info("Loading vocal pitch estimator model")
-                        self.model_pitch_estimator = RMVPE(
-                            "rmvpe.pt",
-                            is_half=self.config.is_half,
-                            device=self.config.device
-                        )
-                    pipe.model_rmvpe = self.model_pitch_estimator
-                cache_params = id_tag
-            # self.infer(
-            #     id_tag,
-            #     params,
-            #     # load model
-            #     n_spk,
-            #     tgt_sr,
-            #     net_g,
-            #     pipe,
-            #     cpt,
-            #     version,
-            #     if_f0,
-            #     # load index
-            #     index_rate,
-            #     index,
-            #     big_npy,
-            #     # load f0 file
-            #     inp_f0,
-            #     # output file
-            #     input_audio_path,
-            #     overwrite,
-            # )
-            thread = threading.Thread(
-                target=self.infer,
-                args=(
-                    id_tag,
-                    params,
-                    # loaded model
-                    n_spk,
-                    tgt_sr,
-                    net_g,
-                    pipe,
-                    cpt,
-                    version,
-                    if_f0,
-                    # loaded index
-                    index_rate,
-                    index,
-                    big_npy,
-                    # loaded f0 file
-                    inp_f0,
-                    # audio file
-                    input_audio_path,
-                    overwrite,
-                )
-            )
-            threads.append(thread)
-        # Run last
-        if threads:
-            self.run_threads(threads)
-        progress_bar.update(len(threads))
-        progress_bar.close()
-        final_result = []
-        valid_tags = set(tag_list)
-        for tag in valid_tags:
-            if (
-                tag in self.model_config.keys()
-                and "result" in self.model_config[tag].keys()
-            ):
-                final_result.extend(self.model_config[tag]["result"])
-        return final_result

+from soni_translate.logging_setup import logger
+import torch
+import gc
+import numpy as np
+import os
+import shutil
+import warnings
+import threading
+from tqdm import tqdm
+from lib.infer_pack.models import (
+    SynthesizerTrnMs256NSFsid,
+    SynthesizerTrnMs256NSFsid_nono,
+    SynthesizerTrnMs768NSFsid,
+    SynthesizerTrnMs768NSFsid_nono,
+)
+from lib.audio import load_audio
+import soundfile as sf
+import edge_tts
+import asyncio
+from soni_translate.utils import remove_directory_contents, create_directories
+from scipy import signal
+from time import time as ttime
+import faiss
+from vci_pipeline import VC, change_rms, bh, ah
+import librosa
+warnings.filterwarnings("ignore")
+class Config:
+    """Pick the inference device, precision and x_* window presets.
+    Attributes set here: device, is_half, n_cpu, gpu_name, gpu_mem and
+    the four presets x_pad, x_query, x_center, x_max.
+    """
+    def __init__(self, only_cpu=False):
+        # Defaults describe the first CUDA device in half precision;
+        # device_config adjusts them to what is actually available.
+        self.device = "cuda:0"
+        self.is_half = True
+        self.n_cpu = 0
+        self.gpu_name = None
+        self.gpu_mem = None
+        presets = self.device_config(only_cpu)
+        self.x_pad, self.x_query, self.x_center, self.x_max = presets
+    def device_config(self, only_cpu) -> tuple:
+        """Detect the device and return (x_pad, x_query, x_center, x_max).
+        Updates device, is_half, n_cpu, gpu_name and gpu_mem as a side
+        effect. `only_cpu` skips the CUDA and MPS branches.
+        """
+        if torch.cuda.is_available() and not only_cpu:
+            i_device = int(self.device.split(":")[-1])
+            self.gpu_name = torch.cuda.get_device_name(i_device)
+            name_upper = self.gpu_name.upper()
+            prefers_single_precision = (
+                ("16" in self.gpu_name and "V100" not in name_upper)
+                or "P40" in name_upper
+                or any(
+                    series in self.gpu_name
+                    for series in ("1060", "1070", "1080")
+                )
+            )
+            if prefers_single_precision:
+                logger.info(
+                    "16/10 Series GPUs and P40 excel "
+                    "in single-precision tasks."
+                )
+                self.is_half = False
+            else:
+                # The name is kept only for the single-precision cards.
+                self.gpu_name = None
+            total_bytes = torch.cuda.get_device_properties(
+                i_device
+            ).total_memory
+            # Size in GiB; adding 0.4 before truncating rounds up from .6
+            self.gpu_mem = int(total_bytes / (1024 ** 3) + 0.4)
+        elif torch.backends.mps.is_available() and not only_cpu:
+            logger.info("Supported N-card not found, using MPS for inference")
+            self.device = "mps"
+        else:
+            logger.info("No supported N-card found, using CPU for inference")
+            self.device = "cpu"
+            self.is_half = False
+        if self.n_cpu == 0:
+            self.n_cpu = os.cpu_count()
+        # Presets are (x_pad, x_query, x_center, x_max).
+        if self.is_half:
+            # 6GB VRAM configuration
+            presets = (3, 10, 60, 65)
+        else:
+            # 5GB VRAM configuration
+            presets = (1, 6, 38, 41)
+        if self.gpu_mem is not None and self.gpu_mem <= 4:
+            # Smaller values for GPUs reporting 4 GiB or less.
+            presets = (1, 5, 30, 32)
+        logger.info(
+            f"Config: Device is {self.device}, "
+            f"half precision is {self.is_half}"
+        )
+        return presets
+BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/"  # URL prefix joined with each BASE_MODELS name in load_hu_bert
+BASE_MODELS = [  # checkpoint file names fetched by load_hu_bert
+    "hubert_base.pt",  # loaded by load_hu_bert through fairseq
+    "rmvpe.pt"  # path given to RMVPE in ClassVoices.__call__
+]
+BASE_DIR = "."  # destination argument passed to download_manager
+def load_hu_bert(config):
+    """Fetch the base checkpoints and return the HuBERT model in eval mode.
+    Every name in ``BASE_MODELS`` is handed to ``download_manager`` with
+    ``BASE_DIR`` as destination, then ``hubert_base.pt`` is loaded through
+    fairseq, moved to ``config.device`` and cast to half or full precision
+    according to ``config.is_half``.
+    Args:
+        config: Object exposing ``device`` and ``is_half``.
+    Returns:
+        The first model of the loaded fairseq ensemble.
+    """
+    # Imported lazily, as in the original implementation.
+    from fairseq import checkpoint_utils
+    from soni_translate.utils import download_manager
+    for model_name in BASE_MODELS:
+        model_url = os.path.join(BASE_DOWNLOAD_LINK, model_name)
+        download_manager(model_url, BASE_DIR)
+    ensemble, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+        ["hubert_base.pt"],
+        suffix="",
+    )
+    hubert = ensemble[0].to(config.device)
+    hubert = hubert.half() if config.is_half else hubert.float()
+    hubert.eval()
+    return hubert
+def load_trained_model(model_path, config):
+    """Load a voice-conversion checkpoint and build its synthesizer.
+    Args:
+        model_path: Path of the checkpoint file given to ``torch.load``.
+        config: Object exposing ``device`` and ``is_half``.
+    Returns:
+        Tuple ``(n_spk, tgt_sr, net_g, vc, cpt, version)`` where ``net_g`` is
+        the synthesizer in eval mode on ``config.device``, ``vc`` is a ``VC``
+        pipeline built with ``tgt_sr`` and ``cpt`` is the raw checkpoint dict.
+    Raises:
+        ValueError: If ``model_path`` is empty, or the checkpoint declares a
+            ``version`` other than ``"v1"`` or ``"v2"``.
+    """
+    if not model_path:
+        raise ValueError("No model found")
+    logger.info("Loading %s" % model_path)
+    # NOTE(security): torch.load unpickles the file, so only trusted
+    # checkpoints should be loaded here.
+    cpt = torch.load(model_path, map_location="cpu")
+    tgt_sr = cpt["config"][-1]
+    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+    if_f0 = cpt.get("f0", 1)
+    # Open question kept from the original code for if_f0 == 0:
+    # "protect to 0.5 need?"
+    version = cpt.get("version", "v1")
+    # version -> (synthesizer with f0, synthesizer without f0)
+    synthesizers = {
+        "v1": (SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono),
+        "v2": (SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono),
+    }
+    if version not in synthesizers:
+        # Previously an unknown version left ``net_g`` unbound and failed
+        # below with an UnboundLocalError.
+        raise ValueError(f"Unsupported model version: {version!r}")
+    with_f0, without_f0 = synthesizers[version]
+    if if_f0 == 1:
+        net_g = with_f0(*cpt["config"], is_half=config.is_half)
+    else:
+        net_g = without_f0(*cpt["config"])
+    del net_g.enc_q
+    # strict=False: the checkpoint weights need not match the module exactly
+    # (``enc_q`` was just removed).
+    net_g.load_state_dict(cpt["weight"], strict=False)
+    net_g.eval().to(config.device)
+    net_g = net_g.half() if config.is_half else net_g.float()
+    vc = VC(tgt_sr, config)
+    n_spk = cpt["config"][-3]
+    return n_spk, tgt_sr, net_g, vc, cpt, version
+class ClassVoices:
+    """Apply configured voice-conversion models to audio files.
+    Models are registered per tag with :meth:`apply_conf`; calling the
+    instance converts a list of audio files, each with the model of its tag,
+    and returns the paths of the files written.
+    """
+    def __init__(self, only_cpu=False):
+        self.model_config = {}
+        self.config = None
+        self.only_cpu = only_cpu
+        # Defined up front so the attributes exist before apply_conf() or
+        # __call__() run; apply_conf() resets the two models when it builds
+        # the Config.
+        self.hu_bert_model = None
+        self.model_pitch_estimator = None
+        self.output_list = []
+    def apply_conf(
+        self,
+        tag="base_model",
+        file_model="",
+        pitch_algo="pm",
+        pitch_lvl=0,
+        file_index="",
+        index_influence=0.66,
+        respiration_median_filtering=3,
+        envelope_ratio=0.25,
+        consonant_breath_protection=0.33,
+        resample_sr=0,
+        file_pitch_algo="",
+    ):
+        """Store the conversion settings for ``tag``.
+        ``None`` for ``file_index`` or ``file_pitch_algo`` is normalised to an
+        empty string. The shared :class:`Config` is created on first use.
+        Returns:
+            A human-readable confirmation string.
+        Raises:
+            ValueError: If ``file_model`` is empty.
+        """
+        if not file_model:
+            raise ValueError("Model not found")
+        if file_index is None:
+            file_index = ""
+        if file_pitch_algo is None:
+            file_pitch_algo = ""
+        if not self.config:
+            self.config = Config(self.only_cpu)
+            self.hu_bert_model = None
+            self.model_pitch_estimator = None
+        self.model_config[tag] = {
+            "file_model": file_model,
+            "pitch_algo": pitch_algo,
+            "pitch_lvl": pitch_lvl,  # no decimal
+            "file_index": file_index,
+            "index_influence": index_influence,
+            "respiration_median_filtering": respiration_median_filtering,
+            "envelope_ratio": envelope_ratio,
+            "consonant_breath_protection": consonant_breath_protection,
+            "resample_sr": resample_sr,
+            "file_pitch_algo": file_pitch_algo,
+        }
+        return f"CONFIGURATION APPLIED FOR {tag}: {file_model}"
+    def infer(
+        self,
+        task_id,
+        params,
+        # load model
+        n_spk,
+        tgt_sr,
+        net_g,
+        pipe,
+        cpt,
+        version,
+        if_f0,
+        # load index
+        index_rate,
+        index,
+        big_npy,
+        # load f0 file
+        inp_f0,
+        # audio file
+        input_audio_path,
+        overwrite,
+    ):
+        """Convert one audio file and write the result to disk.
+        The output path is appended to ``self.model_config[task_id]["result"]``
+        and to ``self.output_list``; nothing is returned.
+        Args:
+            task_id: Tag whose ``"result"`` list receives the output path.
+            params: Settings dict as stored by :meth:`apply_conf`.
+            n_spk, cpt: Accepted for call compatibility; not used here.
+            tgt_sr: Sample rate of the model output.
+            net_g: Synthesizer passed to ``pipe.vc``.
+            pipe: ``VC`` pipeline providing ``window``, ``t_*`` values,
+                ``get_f0`` and ``vc``.
+            version: Model version string passed to ``pipe.vc``.
+            if_f0: ``1`` when the model uses pitch data.
+            index_rate, index, big_npy: Index data passed to ``pipe.vc``.
+            inp_f0: Optional f0 array passed to ``pipe.get_f0``.
+            input_audio_path: File to convert.
+            overwrite: Write over the input file instead of ``*_edited.*``.
+        Raises:
+            ValueError: If ``input_audio_path`` does not exist.
+        """
+        f0_method = params["pitch_algo"]
+        f0_up_key = params["pitch_lvl"]
+        filter_radius = params["respiration_median_filtering"]
+        resample_sr = params["resample_sr"]
+        rms_mix_rate = params["envelope_ratio"]
+        protect = params["consonant_breath_protection"]
+        if not os.path.exists(input_audio_path):
+            raise ValueError(
+                "The audio file was not found or is not "
+                f"a valid file: {input_audio_path}"
+            )
+        f0_up_key = int(f0_up_key)
+        audio = load_audio(input_audio_path, 16000)
+        # Normalize audio
+        audio_max = np.abs(audio).max() / 0.95
+        if audio_max > 1:
+            audio /= audio_max
+        times = [0, 0, 0]
+        # filters audio signal, pads it, computes sliding window sums,
+        # and extracts optimized time indices
+        audio = signal.filtfilt(bh, ah, audio)
+        audio_pad = np.pad(
+            audio, (pipe.window // 2, pipe.window // 2), mode="reflect"
+        )
+        opt_ts = []
+        if audio_pad.shape[0] > pipe.t_max:
+            audio_sum = np.zeros_like(audio)
+            for i in range(pipe.window):
+                audio_sum += audio_pad[i:i - pipe.window]
+            for t in range(pipe.t_center, audio.shape[0], pipe.t_center):
+                # Cut at the first sample with the smallest windowed sum
+                # around each nominal centre.
+                window_abs = np.abs(
+                    audio_sum[t - pipe.t_query: t + pipe.t_query]
+                )
+                opt_ts.append(
+                    t
+                    - pipe.t_query
+                    + np.where(window_abs == window_abs.min())[0][0]
+                )
+        s = 0
+        audio_opt = []
+        t = None
+        t1 = ttime()
+        sid_value = 0
+        sid = torch.tensor(sid_value, device=pipe.device).unsqueeze(0).long()
+        # Pads audio symmetrically, calculates length divided by window size.
+        audio_pad = np.pad(audio, (pipe.t_pad, pipe.t_pad), mode="reflect")
+        p_len = audio_pad.shape[0] // pipe.window
+        # Estimates pitch from audio signal
+        pitch, pitchf = None, None
+        if if_f0 == 1:
+            pitch, pitchf = pipe.get_f0(
+                input_audio_path,
+                audio_pad,
+                p_len,
+                f0_up_key,
+                f0_method,
+                filter_radius,
+                inp_f0,
+            )
+            pitch = pitch[:p_len]
+            pitchf = pitchf[:p_len]
+            if pipe.device == "mps":
+                pitchf = pitchf.astype(np.float32)
+            pitch = torch.tensor(
+                pitch, device=pipe.device
+            ).unsqueeze(0).long()
+            pitchf = torch.tensor(
+                pitchf, device=pipe.device
+            ).unsqueeze(0).float()
+        t2 = ttime()
+        times[1] += t2 - t1
+        for t in opt_ts:
+            t = t // pipe.window * pipe.window
+            if if_f0 == 1:
+                pitch_slice = pitch[
+                    :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
+                ]
+                pitchf_slice = pitchf[
+                    :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
+                ]
+            else:
+                pitch_slice = None
+                pitchf_slice = None
+            audio_slice = audio_pad[s:t + pipe.t_pad2 + pipe.window]
+            audio_opt.append(
+                pipe.vc(
+                    self.hu_bert_model,
+                    net_g,
+                    sid,
+                    audio_slice,
+                    pitch_slice,
+                    pitchf_slice,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
+            )
+            s = t
+        # Tail segment. ``pitch``/``pitchf`` are None for models without f0,
+        # so they must only be sliced when pitch data exists; slicing them
+        # unconditionally raised TypeError on split audio.
+        if if_f0 == 1 and t is not None:
+            pitch_end_slice = pitch[:, t // pipe.window:]
+            pitchf_end_slice = pitchf[:, t // pipe.window:]
+        else:
+            pitch_end_slice = pitch
+            pitchf_end_slice = pitchf
+        audio_opt.append(
+            pipe.vc(
+                self.hu_bert_model,
+                net_g,
+                sid,
+                audio_pad[t:],  # t is None for unsplit audio: whole array
+                pitch_end_slice,
+                pitchf_end_slice,
+                times,
+                index,
+                big_npy,
+                index_rate,
+                version,
+                protect,
+            )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
+        )
+        audio_opt = np.concatenate(audio_opt)
+        if rms_mix_rate != 1:
+            audio_opt = change_rms(
+                audio, 16000, audio_opt, tgt_sr, rms_mix_rate
+            )
+        # Resampling is requested with a rate of at least 16 kHz that differs
+        # from the model rate; the same test selects the rate written below.
+        needs_resample = resample_sr >= 16000 and tgt_sr != resample_sr
+        if needs_resample:
+            audio_opt = librosa.resample(
+                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+            )
+        # Scale to int16, shrinking the gain when the peak would clip.
+        audio_max = np.abs(audio_opt).max() / 0.99
+        max_int16 = 32768
+        if audio_max > 1:
+            max_int16 /= audio_max
+        audio_opt = (audio_opt * max_int16).astype(np.int16)
+        del pitch, pitchf, sid
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        final_sr = resample_sr if needs_resample else tgt_sr
+        if overwrite:
+            output_audio_path = input_audio_path  # Overwrite
+        else:
+            basename = os.path.basename(input_audio_path)
+            dirname = os.path.dirname(input_audio_path)
+            # NOTE(review): only the text before the first dot and after the
+            # last dot is kept, so "a.b.wav" becomes "a_edited.wav". Left
+            # unchanged because callers may rely on this naming.
+            name_parts = basename.split('.')
+            new_basename = name_parts[0] + "_edited." + name_parts[-1]
+            new_path = os.path.join(dirname, new_basename)
+            logger.info(str(new_path))
+            output_audio_path = new_path
+        # Save file
+        sf.write(
+            file=output_audio_path,
+            samplerate=final_sr,
+            data=audio_opt
+        )
+        self.model_config[task_id]["result"].append(output_audio_path)
+        self.output_list.append(output_audio_path)
+    def make_test(
+        self,
+        tts_text,
+        tts_voice,
+        model_path,
+        index_path,
+        transpose,
+        f0_method,
+    ):
+        """Synthesize ``tts_text`` with edge-tts and convert it with a model.
+        The ``test`` folder is created and emptied, the speech is saved to
+        ``test/test.wav``, copied to ``test/test_edited.wav`` and the copy is
+        converted in place under the ``test_edge`` tag.
+        Returns:
+            Tuple ``(tts_edited, tts_file)`` of the two file paths.
+        Raises:
+            ValueError: If edge-tts fails to produce the audio file.
+        """
+        folder_test = "test"
+        tag = "test_edge"
+        tts_file = "test/test.wav"
+        tts_edited = "test/test_edited.wav"
+        create_directories(folder_test)
+        remove_directory_contents(folder_test)
+        if "SET_LIMIT" == os.getenv("DEMO"):
+            if len(tts_text) > 60:
+                tts_text = tts_text[:60]
+                logger.warning("DEMO; limit to 60 characters")
+        try:
+            # The last "-"-separated field of ``tts_voice`` is dropped
+            # before the name is handed to edge-tts.
+            asyncio.run(edge_tts.Communicate(
+                tts_text, "-".join(tts_voice.split('-')[:-1])
+            ).save(tts_file))
+        except Exception as e:
+            raise ValueError(
+                "No audio was received. Please change the "
+                f"tts voice for {tts_voice}. Error: {str(e)}"
+            ) from e
+        shutil.copy(tts_file, tts_edited)
+        self.apply_conf(
+            tag=tag,
+            file_model=model_path,
+            pitch_algo=f0_method,
+            pitch_lvl=transpose,
+            file_index=index_path,
+            index_influence=0.66,
+            respiration_median_filtering=3,
+            envelope_ratio=0.25,
+            consonant_breath_protection=0.33,
+        )
+        self(
+            audio_files=tts_edited,
+            tag_list=tag,
+            overwrite=True
+        )
+        return tts_edited, tts_file
+    def run_threads(self, threads):
+        """Start every thread, wait for all of them, then free memory."""
+        # Start threads
+        for thread in threads:
+            thread.start()
+        # Wait for all threads to finish
+        for thread in threads:
+            thread.join()
+        gc.collect()
+        torch.cuda.empty_cache()
+    def unload_models(self):
+        """Drop the HuBERT and pitch-estimator references and free memory."""
+        self.hu_bert_model = None
+        self.model_pitch_estimator = None
+        gc.collect()
+        torch.cuda.empty_cache()
+    def __call__(
+        self,
+        audio_files=None,
+        tag_list=None,
+        overwrite=False,
+        parallel_workers=1,
+    ):
+        """Convert ``audio_files`` using the models registered for their tags.
+        Args:
+            audio_files: One path or a list of paths.
+            tag_list: One tag or a list of tags paired with ``audio_files``.
+                When empty, the most recently configured tag is used for
+                every file; a shorter list is padded with its first tag and a
+                longer one is truncated. The caller's list is not modified.
+            overwrite: Forwarded to :meth:`infer`.
+            parallel_workers: Maximum number of conversion threads per batch.
+        Returns:
+            List with the output paths recorded for the requested tags.
+        Raises:
+            ValueError: If no model was configured or no audio was given.
+        """
+        logger.info(f"Parallel workers: {str(parallel_workers)}")
+        self.output_list = []
+        if not self.model_config:
+            raise ValueError("No model has been configured for inference")
+        if isinstance(audio_files, str):
+            audio_files = [audio_files]
+        if isinstance(tag_list, str):
+            tag_list = [tag_list]
+        if not audio_files:
+            raise ValueError("No audio found to convert")
+        # Work on a copy: the padding below must not mutate the caller's list
+        # (and the defaults are no longer shared mutable objects).
+        tag_list = list(tag_list) if tag_list else []
+        if not tag_list:
+            tag_list = [list(self.model_config)[-1]] * len(audio_files)
+        if len(audio_files) > len(tag_list):
+            logger.info("Extend tag list to match audio files")
+            extend_number = len(audio_files) - len(tag_list)
+            tag_list.extend([tag_list[0]] * extend_number)
+        if len(audio_files) < len(tag_list):
+            logger.info("Cut list tags")
+            tag_list = tag_list[:len(audio_files)]
+        # Sorting by tag groups the files so each model is loaded once.
+        sorted_tag_file = sorted(
+            zip(tag_list, audio_files), key=lambda pair: pair[0]
+        )
+        # Base params
+        if self.hu_bert_model is None:
+            self.hu_bert_model = load_hu_bert(self.config)
+        cache_params = None
+        threads = []
+        progress_bar = tqdm(total=len(tag_list), desc="Progress")
+        for id_tag, input_audio_path in sorted_tag_file:
+            if id_tag not in self.model_config:
+                logger.info(
+                    f"No configured model for {id_tag} with {input_audio_path}"
+                )
+                continue
+            tag_changed = cache_params != id_tag
+            # Flush the pending batch when it is full, or before switching to
+            # another model so running threads keep the model they were
+            # created with.
+            if (
+                len(threads) >= parallel_workers
+                or (tag_changed and cache_params is not None)
+            ):
+                self.run_threads(threads)
+                progress_bar.update(len(threads))
+                threads = []
+            if tag_changed:
+                self.model_config[id_tag]["result"] = []
+                # Unload previous
+                (
+                    n_spk,
+                    tgt_sr,
+                    net_g,
+                    pipe,
+                    cpt,
+                    version,
+                    if_f0,
+                    index_rate,
+                    index,
+                    big_npy,
+                    inp_f0,
+                ) = [None] * 11
+                gc.collect()
+                torch.cuda.empty_cache()
+                # Model params
+                params = self.model_config[id_tag]
+                model_path = params["file_model"]
+                f0_method = params["pitch_algo"]
+                file_index = params["file_index"]
+                index_rate = params["index_influence"]
+                f0_file = params["file_pitch_algo"]
+                # Load model
+                (
+                    n_spk,
+                    tgt_sr,
+                    net_g,
+                    pipe,
+                    cpt,
+                    version
+                ) = load_trained_model(model_path, self.config)
+                if_f0 = cpt.get("f0", 1)  # pitch data
+                # Load index
+                if os.path.exists(file_index) and index_rate != 0:
+                    try:
+                        index = faiss.read_index(file_index)
+                        big_npy = index.reconstruct_n(0, index.ntotal)
+                    except Exception as error:
+                        # Best effort: continue without the index.
+                        logger.error(f"Index: {str(error)}")
+                        index_rate = 0
+                        index = big_npy = None
+                else:
+                    logger.warning("File index not found")
+                    index_rate = 0
+                    index = big_npy = None
+                # Load f0 file
+                inp_f0 = None
+                if os.path.exists(f0_file):
+                    try:
+                        with open(f0_file, "r") as f:
+                            lines = f.read().strip("\n").split("\n")
+                        # One row of comma-separated floats per line.
+                        inp_f0 = np.array(
+                            [
+                                [float(i) for i in line.split(",")]
+                                for line in lines
+                            ],
+                            dtype="float32",
+                        )
+                    except Exception as error:
+                        # Best effort: log the problem and continue.
+                        logger.error(f"f0 file: {str(error)}")
+                if "rmvpe" in f0_method:
+                    if self.model_pitch_estimator is None:
+                        from lib.rmvpe import RMVPE
+                        logger.info("Loading vocal pitch estimator model")
+                        self.model_pitch_estimator = RMVPE(
+                            "rmvpe.pt",
+                            is_half=self.config.is_half,
+                            device=self.config.device
+                        )
+                    pipe.model_rmvpe = self.model_pitch_estimator
+                cache_params = id_tag
+            thread = threading.Thread(
+                target=self.infer,
+                args=(
+                    id_tag,
+                    params,
+                    # loaded model
+                    n_spk,
+                    tgt_sr,
+                    net_g,
+                    pipe,
+                    cpt,
+                    version,
+                    if_f0,
+                    # loaded index
+                    index_rate,
+                    index,
+                    big_npy,
+                    # loaded f0 file
+                    inp_f0,
+                    # audio file
+                    input_audio_path,
+                    overwrite,
+                )
+            )
+            threads.append(thread)
+        # Run last
+        if threads:
+            self.run_threads(threads)
+        progress_bar.update(len(threads))
+        progress_bar.close()
+        final_result = []
+        valid_tags = set(tag_list)
+        for tag in valid_tags:
+            tag_conf = self.model_config.get(tag)
+            if tag_conf is not None and "result" in tag_conf:
+                final_result.extend(tag_conf["result"])
+        return final_result