camilosegura
commited on
Commit
•
7d873e2
1
Parent(s):
d1c8358
Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- .gitignore +194 -0
- Lenguajes soportados Deep Translator.txt +135 -0
- README.md +38 -8
- Tutorial meta-mms.txt +72 -0
- app.py +354 -0
- assets/images/favico.ico +0 -0
- assets/images/icono.png +0 -0
- assets/images/logo.png +0 -0
- assets/styles/css.css +3 -0
- audio_output.mp3 +3 -0
- data/eng/D_100000.pth +3 -0
- data/eng/G_100000.pth +3 -0
- data/eng/config.json +87 -0
- data/eng/vocab.txt +38 -0
- data/gum/D_100000.pth +3 -0
- data/gum/G_100000.pth +3 -0
- data/gum/config.json +87 -0
- data/gum/vocab.txt +43 -0
- data/quz/D_100000.pth +3 -0
- data/quz/G_100000.pth +3 -0
- data/quz/config.json +87 -0
- data/quz/vocab.txt +37 -0
- data/spa/D_100000.pth +3 -0
- data/spa/G_100000.pth +3 -0
- data/spa/config.json +87 -0
- data/spa/vocab.txt +45 -0
- logs.txt +0 -0
- output.wav +0 -0
- requirements.txt +4 -0
- video.mp4 +3 -0
- video_subtitles.srt +117 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
audio_output.mp3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
video.mp4 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Created by .ignore support plugin (hsz.mobi)
|
2 |
+
### Python template
|
3 |
+
# Byte-compiled / optimized / DLL files
|
4 |
+
__pycache__/
|
5 |
+
*.py[cod]
|
6 |
+
*$py.class
|
7 |
+
|
8 |
+
# C extensions
|
9 |
+
*.so
|
10 |
+
|
11 |
+
# Distribution / packaging
|
12 |
+
.Python
|
13 |
+
data/
|
14 |
+
flagged/
|
15 |
+
env/
|
16 |
+
venv/
|
17 |
+
build/
|
18 |
+
develop-eggs/
|
19 |
+
dist/
|
20 |
+
downloads/
|
21 |
+
eggs/
|
22 |
+
.eggs/
|
23 |
+
lib/
|
24 |
+
lib64/
|
25 |
+
parts/
|
26 |
+
sdist/
|
27 |
+
var/
|
28 |
+
*.egg-info/
|
29 |
+
.installed.cfg
|
30 |
+
*.egg
|
31 |
+
|
32 |
+
# PyInstaller
|
33 |
+
# Usually these files are written by a python script from a template
|
34 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
35 |
+
*.manifest
|
36 |
+
*.spec
|
37 |
+
|
38 |
+
# Installer logs
|
39 |
+
pip-log.txt
|
40 |
+
pip-delete-this-directory.txt
|
41 |
+
|
42 |
+
# Unit test / coverage reports
|
43 |
+
htmlcov/
|
44 |
+
.tox/
|
45 |
+
.coverage
|
46 |
+
.coverage.*
|
47 |
+
.cache
|
48 |
+
nosetests.xml
|
49 |
+
coverage.xml
|
50 |
+
*,cover
|
51 |
+
.hypothesis/
|
52 |
+
|
53 |
+
# Translations
|
54 |
+
*.mo
|
55 |
+
*.pot
|
56 |
+
|
57 |
+
# Django stuff:
|
58 |
+
*.log
|
59 |
+
local_settings.py
|
60 |
+
|
61 |
+
# Flask stuff:
|
62 |
+
instance/
|
63 |
+
.webassets-cache
|
64 |
+
|
65 |
+
# Scrapy stuff:
|
66 |
+
.scrapy
|
67 |
+
|
68 |
+
# Sphinx documentation
|
69 |
+
docs/_build/
|
70 |
+
|
71 |
+
# PyBuilder
|
72 |
+
target/
|
73 |
+
|
74 |
+
# IPython Notebook
|
75 |
+
.ipynb_checkpoints
|
76 |
+
|
77 |
+
# pyenv
|
78 |
+
.python-version
|
79 |
+
|
80 |
+
# celery beat schedule file
|
81 |
+
celerybeat-schedule
|
82 |
+
|
83 |
+
# dotenv
|
84 |
+
.env
|
85 |
+
|
86 |
+
# virtualenv
|
87 |
+
venv/
|
88 |
+
ENV/
|
89 |
+
|
90 |
+
# Spyder project settings
|
91 |
+
.spyderproject
|
92 |
+
|
93 |
+
# Rope project settings
|
94 |
+
.ropeproject
|
95 |
+
### VirtualEnv template
|
96 |
+
# Virtualenv
|
97 |
+
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
|
98 |
+
[Bb]in
|
99 |
+
[Ii]nclude
|
100 |
+
[Ll]ib
|
101 |
+
[Ll]ib64
|
102 |
+
[Ll]ocal
|
103 |
+
[Ss]cripts
|
104 |
+
pyvenv.cfg
|
105 |
+
.venv
|
106 |
+
pip-selfcheck.json
|
107 |
+
|
108 |
+
### JetBrains template
|
109 |
+
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
110 |
+
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
111 |
+
|
112 |
+
# User-specific stuff
|
113 |
+
.idea/**/workspace.xml
|
114 |
+
.idea/**/tasks.xml
|
115 |
+
.idea/**/usage.statistics.xml
|
116 |
+
.idea/**/dictionaries
|
117 |
+
.idea/**/shelf
|
118 |
+
|
119 |
+
# AWS User-specific
|
120 |
+
.idea/**/aws.xml
|
121 |
+
|
122 |
+
# Generated files
|
123 |
+
.idea/**/contentModel.xml
|
124 |
+
|
125 |
+
# Sensitive or high-churn files
|
126 |
+
.idea/**/dataSources/
|
127 |
+
.idea/**/dataSources.ids
|
128 |
+
.idea/**/dataSources.local.xml
|
129 |
+
.idea/**/sqlDataSources.xml
|
130 |
+
.idea/**/dynamic.xml
|
131 |
+
.idea/**/uiDesigner.xml
|
132 |
+
.idea/**/dbnavigator.xml
|
133 |
+
|
134 |
+
# Gradle
|
135 |
+
.idea/**/gradle.xml
|
136 |
+
.idea/**/libraries
|
137 |
+
|
138 |
+
# Gradle and Maven with auto-import
|
139 |
+
# When using Gradle or Maven with auto-import, you should exclude module files,
|
140 |
+
# since they will be recreated, and may cause churn. Uncomment if using
|
141 |
+
# auto-import.
|
142 |
+
# .idea/artifacts
|
143 |
+
# .idea/compiler.xml
|
144 |
+
# .idea/jarRepositories.xml
|
145 |
+
# .idea/modules.xml
|
146 |
+
# .idea/*.iml
|
147 |
+
# .idea/modules
|
148 |
+
# *.iml
|
149 |
+
# *.ipr
|
150 |
+
|
151 |
+
# CMake
|
152 |
+
cmake-build-*/
|
153 |
+
|
154 |
+
# Mongo Explorer plugin
|
155 |
+
.idea/**/mongoSettings.xml
|
156 |
+
|
157 |
+
# File-based project format
|
158 |
+
*.iws
|
159 |
+
|
160 |
+
# IntelliJ
|
161 |
+
out/
|
162 |
+
|
163 |
+
# mpeltonen/sbt-idea plugin
|
164 |
+
.idea_modules/
|
165 |
+
|
166 |
+
# JIRA plugin
|
167 |
+
atlassian-ide-plugin.xml
|
168 |
+
|
169 |
+
# Cursive Clojure plugin
|
170 |
+
.idea/replstate.xml
|
171 |
+
|
172 |
+
# SonarLint plugin
|
173 |
+
.idea/sonarlint/
|
174 |
+
|
175 |
+
# Crashlytics plugin (for Android Studio and IntelliJ)
|
176 |
+
com_crashlytics_export_strings.xml
|
177 |
+
crashlytics.properties
|
178 |
+
crashlytics-build.properties
|
179 |
+
fabric.properties
|
180 |
+
|
181 |
+
# Editor-based Rest Client
|
182 |
+
.idea/httpRequests
|
183 |
+
|
184 |
+
# Android studio 3.1+ serialized cache file
|
185 |
+
.idea/caches/build_file_checksums.ser
|
186 |
+
|
187 |
+
# idea folder, uncomment if you don't need it
|
188 |
+
.idea
|
189 |
+
*.gz
|
190 |
+
*.wav
|
191 |
+
|
192 |
+
main.py
|
193 |
+
mms_ars.py
|
194 |
+
.DS_Store
|
Lenguajes soportados Deep Translator.txt
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
'afrikaans': 'af',
|
3 |
+
'albanian': 'sq',
|
4 |
+
'amharic': 'am',
|
5 |
+
'arabic': 'ar',
|
6 |
+
'armenian': 'hy',
|
7 |
+
'assamese': 'as',
|
8 |
+
'aymara': 'ay',
|
9 |
+
'azerbaijani': 'az',
|
10 |
+
'bambara': 'bm',
|
11 |
+
'basque': 'eu',
|
12 |
+
'belarusian': 'be',
|
13 |
+
'bengali': 'bn',
|
14 |
+
'bhojpuri': 'bho',
|
15 |
+
'bosnian': 'bs',
|
16 |
+
'bulgarian': 'bg',
|
17 |
+
'catalan': 'ca',
|
18 |
+
'cebuano': 'ceb',
|
19 |
+
'chichewa': 'ny',
|
20 |
+
'chinese (simplified)': 'zh-CN',
|
21 |
+
'chinese (traditional)': 'zh-TW',
|
22 |
+
'corsican': 'co',
|
23 |
+
'croatian': 'hr',
|
24 |
+
'czech': 'cs',
|
25 |
+
'danish': 'da',
|
26 |
+
'dhivehi': 'dv',
|
27 |
+
'dogri': 'doi',
|
28 |
+
'dutch': 'nl',
|
29 |
+
'english': 'en',
|
30 |
+
'esperanto': 'eo',
|
31 |
+
'estonian': 'et',
|
32 |
+
'ewe': 'ee',
|
33 |
+
'filipino': 'tl',
|
34 |
+
'finnish': 'fi',
|
35 |
+
'french': 'fr',
|
36 |
+
'frisian': 'fy',
|
37 |
+
'galician': 'gl',
|
38 |
+
'georgian': 'ka',
|
39 |
+
'german': 'de',
|
40 |
+
'greek': 'el',
|
41 |
+
'guarani': 'gn',
|
42 |
+
'gujarati': 'gu',
|
43 |
+
'haitian creole': 'ht',
|
44 |
+
'hausa': 'ha',
|
45 |
+
'hawaiian': 'haw',
|
46 |
+
'hebrew': 'iw',
|
47 |
+
'hindi': 'hi',
|
48 |
+
'hmong': 'hmn',
|
49 |
+
'hungarian': 'hu',
|
50 |
+
'icelandic': 'is',
|
51 |
+
'igbo': 'ig',
|
52 |
+
'ilocano': 'ilo',
|
53 |
+
'indonesian': 'id',
|
54 |
+
'irish': 'ga',
|
55 |
+
'italian': 'it',
|
56 |
+
'japanese': 'ja',
|
57 |
+
'javanese': 'jw',
|
58 |
+
'kannada': 'kn',
|
59 |
+
'kazakh': 'kk',
|
60 |
+
'khmer': 'km',
|
61 |
+
'kinyarwanda': 'rw',
|
62 |
+
'konkani': 'gom',
|
63 |
+
'korean': 'ko',
|
64 |
+
'krio': 'kri',
|
65 |
+
'kurdish (kurmanji)': 'ku',
|
66 |
+
'kurdish (sorani)': 'ckb',
|
67 |
+
'kyrgyz': 'ky',
|
68 |
+
'lao': 'lo',
|
69 |
+
'latin': 'la',
|
70 |
+
'latvian': 'lv',
|
71 |
+
'lingala': 'ln',
|
72 |
+
'lithuanian': 'lt',
|
73 |
+
'luganda': 'lg',
|
74 |
+
'luxembourgish': 'lb',
|
75 |
+
'macedonian': 'mk',
|
76 |
+
'maithili': 'mai',
|
77 |
+
'malagasy': 'mg',
|
78 |
+
'malay': 'ms',
|
79 |
+
'malayalam': 'ml',
|
80 |
+
'maltese': 'mt',
|
81 |
+
'maori': 'mi',
|
82 |
+
'marathi': 'mr',
|
83 |
+
'meiteilon (manipuri)': 'mni-Mtei',
|
84 |
+
'mizo': 'lus',
|
85 |
+
'mongolian': 'mn',
|
86 |
+
'myanmar': 'my',
|
87 |
+
'nepali': 'ne',
|
88 |
+
'norwegian': 'no',
|
89 |
+
'odia (oriya)': 'or',
|
90 |
+
'oromo': 'om',
|
91 |
+
'pashto': 'ps',
|
92 |
+
'persian': 'fa',
|
93 |
+
'polish': 'pl',
|
94 |
+
'portuguese': 'pt',
|
95 |
+
'punjabi': 'pa',
|
96 |
+
'quechua': 'qu',
|
97 |
+
'romanian': 'ro',
|
98 |
+
'russian': 'ru',
|
99 |
+
'samoan': 'sm',
|
100 |
+
'sanskrit': 'sa',
|
101 |
+
'scots gaelic': 'gd',
|
102 |
+
'sepedi': 'nso',
|
103 |
+
'serbian': 'sr',
|
104 |
+
'sesotho': 'st',
|
105 |
+
'shona': 'sn',
|
106 |
+
'sindhi': 'sd',
|
107 |
+
'sinhala': 'si',
|
108 |
+
'slovak': 'sk',
|
109 |
+
'slovenian': 'sl',
|
110 |
+
'somali': 'so',
|
111 |
+
'spanish': 'es',
|
112 |
+
'sundanese': 'su',
|
113 |
+
'swahili': 'sw',
|
114 |
+
'swedish': 'sv',
|
115 |
+
'tajik': 'tg',
|
116 |
+
'tamil': 'ta',
|
117 |
+
'tatar': 'tt',
|
118 |
+
'telugu': 'te',
|
119 |
+
'thai': 'th',
|
120 |
+
'tigrinya': 'ti',
|
121 |
+
'tsonga': 'ts',
|
122 |
+
'turkish': 'tr',
|
123 |
+
'turkmen': 'tk',
|
124 |
+
'twi': 'ak',
|
125 |
+
'ukrainian': 'uk',
|
126 |
+
'urdu': 'ur',
|
127 |
+
'uyghur': 'ug',
|
128 |
+
'uzbek': 'uz',
|
129 |
+
'vietnamese': 'vi',
|
130 |
+
'welsh': 'cy',
|
131 |
+
'xhosa': 'xh',
|
132 |
+
'yiddish': 'yi',
|
133 |
+
'yoruba': 'yo',
|
134 |
+
'zulu': 'zu'
|
135 |
+
}
|
README.md
CHANGED
@@ -1,12 +1,42 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji: 🔥
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: gray
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 3.38.0
|
8 |
app_file: app.py
|
9 |
-
|
|
|
10 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: traductor-multilenguaje
|
|
|
|
|
|
|
|
|
|
|
3 |
app_file: app.py
|
4 |
+
sdk: gradio
|
5 |
+
sdk_version: 3.35.2
|
6 |
---
|
7 |
+
# Web App of Meta's Massively Multilingual Speech (MMS)
|
8 |
+
|
9 |
+
This repository contains a Python code that implements a [META's Massively Multilingual Speech (MMS)](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) using the Gradio library. The application allows users to record audio and convert it to text, or enter text and generate corresponding local speech output.
|
10 |
+
|
11 |
+
## Step 1: Clone repo
|
12 |
+
```shell
|
13 |
+
git clone https://github.com/ikequan/meta-mms.git
|
14 |
+
cd meta-mms
|
15 |
+
```
|
16 |
+
## Step 2: Prerequisites
|
17 |
+
|
18 |
+
Before running the code, make sure you have the following requirements installed:
|
19 |
+
|
20 |
+
- Python 3.x
|
21 |
+
- gradio
|
22 |
+
- speech_recognition
|
23 |
+
- ttsmms
|
24 |
+
- deep_translator
|
25 |
+
|
26 |
+
You can install the required packages using the following command:
|
27 |
+
|
28 |
+
```shell
|
29 |
+
pip install gradio SpeechRecognition ttsmms deep_translator
|
30 |
+
```
|
31 |
+
|
32 |
+
## Step 3: Download language model
|
33 |
+
Check [here](https://github.com/wannaphong/ttsmms/blob/main/support_list.txt) for supported languages and their iso code for this step.
|
34 |
+
```shell
|
35 |
+
curl https://dl.fbaipublicfiles.com/mms/tts/{put your language iso code here}.tar.gz --output {put your language iso code here}.tar.gz # Update lang
|
36 |
+
mkdir -p data && tar -xzf {put your language iso code here}.tar.gz -C data/ # Update langcode
|
37 |
+
```
|
38 |
|
39 |
+
## Step 4: Run code
|
40 |
+
```shell
|
41 |
+
python app.py
|
42 |
+
```
|
Tutorial meta-mms.txt
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
https://www.youtube.com/watch?v=7K4b2S7X99w
|
2 |
+
https://github.com/ikequan/meta-mms
|
3 |
+
|
4 |
+
#Github proyecto
|
5 |
+
https://github.com/AYTECOL/traductor-multilenguaje.git
|
6 |
+
|
7 |
+
#Las librerias se instalan en:
|
8 |
+
C:\Users\jorge\AppData\Local\Programs\Python\Python311\Scripts
|
9 |
+
|
10 |
+
# Salida de Audio de video
|
11 |
+
C:\Users\jorge\AppData\Local\Temp\gradio\04300dd9108b391bd8a7984ab530b47d54bfec91\
|
12 |
+
|
13 |
+
#Lenguajes soportados:
|
14 |
+
https://github.com/wannaphong/ttsmms/blob/main/support_list.txt
|
15 |
+
|
16 |
+
# Descargar e instalar Python si no está instalado
|
17 |
+
Instalar en C:\Users\jorge\AppData\Local\Programs\Python\Python311\
|
18 |
+
Pasar al directorio \Scripts para que tome el comando .\pip
|
19 |
+
|
20 |
+
# Comprobar la instalacion de PIP:
|
21 |
+
.\pip help
|
22 |
+
|
23 |
+
# Si PIP no está instalado descargarlo de:
|
24 |
+
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
|
25 |
+
# Instalar PIP, pasar a la carpeta donde se descargó el archivo get-pip.py:
|
26 |
+
py get-pip.py
|
27 |
+
# Agregar el PATH de la instalación en las variables de entorno y colocarlo al inicio de la lista
|
28 |
+
|
29 |
+
# Si no está instalado Microsoft Visual C++ 14.0 or greater instalarlo de:
|
30 |
+
https://visualstudio.microsoft.com/visual-cpp-build-tools/
|
31 |
+
seleccionando las utilidades para desktop windows
|
32 |
+
|
33 |
+
# Instalar transformers:
|
34 |
+
.\pip install torch datasets[audio]
|
35 |
+
.\pip install --upgrade transformers
|
36 |
+
|
37 |
+
# Instalar los complementos necesarios para la aplicación:
|
38 |
+
.\pip install gradio SpeechRecognition ttsmms deep_translator
|
39 |
+
|
40 |
+
# instalar ffmpeg mediante cmd como administrador:
|
41 |
+
choco install ffmpeg
|
42 |
+
|
43 |
+
# Comprobar ISO de idiomas disponibles:
|
44 |
+
https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html
|
45 |
+
|
46 |
+
# descargar los idiomas a soportar cambiando el ISO del idioma correspondiente:
|
47 |
+
https://dl.fbaipublicfiles.com/mms/tts/full_model/eng.tar.gz #Inglés (eng)
|
48 |
+
https://dl.fbaipublicfiles.com/mms/tts/full_model/spa.tar.gz #Español (spa)
|
49 |
+
https://dl.fbaipublicfiles.com/mms/tts/full_model/gum.tar.gz #Misak (gum)
|
50 |
+
https://dl.fbaipublicfiles.com/mms/tts/full_model/quz.tar.gz #Quechua Cuzco (quz)
|
51 |
+
|
52 |
+
# crear carpeta "data" y descomprimir ahi los lenguajes dentro del proyecto:
|
53 |
+
/meta-mms/data/spa/
|
54 |
+
/meta-mms/data/eng/
|
55 |
+
|
56 |
+
# Comandos procesamiento de Audio y Video
|
57 |
+
# extraer audio de un video
|
58 |
+
ffmpeg -y -i input.mp4 -ar 16000 -ac 1 output_audio.wav
|
59 |
+
|
60 |
+
# dejar un video sin audio
|
61 |
+
ffmpeg -y -i input.mp4 -t 43 output_muted.webm
|
62 |
+
ffmpeg -y -i input.mp4 -shortest output_muted.webm
|
63 |
+
|
64 |
+
# unir audio con video
|
65 |
+
ffmpeg -y -i input.mp4 -i audio.wav -an output_muted.webm
|
66 |
+
|
67 |
+
# subtitulos
|
68 |
+
ffmpeg -y -copyts -i input.webm -vf subtitles=subtitle.srt output_srt.webm
|
69 |
+
ffmpeg -y -copyts -i noticias_caracol_tv.mp4 -vf subtitles=noticias_caracol_tv_subtitles.srt output_srt.webm
|
70 |
+
|
71 |
+
# Para ejecutar la aplicación:
|
72 |
+
py app.py
|
app.py
ADDED
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import speech_recognition as sr # Libreria de Audio
|
3 |
+
from ttsmms import TTS
|
4 |
+
from deep_translator import GoogleTranslator
|
5 |
+
|
6 |
+
import subprocess # Libreria para procesamiento de comandos cmd para video
|
7 |
+
import os
|
8 |
+
import time # Libreria para manejo de tiempos
|
9 |
+
import math # Libreria matemática, usada para redondeo de cifras
|
10 |
+
from threading import Thread # Librería para manejo de Hilos de procesamiento
|
11 |
+
|
12 |
+
# Input language locale used by Google speech recognition (Spanish, Spain).
input_language = 'es-ES'
# Container formats for the generated audio / video artifacts.
output_audio_format = 'mp3'
output_video_format = 'mp4'

# Initialize one TTS model per supported output language.
# Each model is loaded from its downloaded MMS checkpoint directory under data/.
spanish = TTS("data/spa")  # Spanish
english = TTS("data/eng")  # English
misak = TTS("data/gum")    # Misak
quechua = TTS("data/quz")  # Quechua (Cusco)

# Supported target languages and their matching TTS model.
# The "lang" value must match a deep_translator language name (see text_to_text).
#langs = [{"lang": 'spanish', "tts": spanish}, {"lang": 'english', "tts": english}, {"lang": 'guarani', "tts": misak}, {"lang": 'quechua', "tts": quechua}]
langs = [{"lang": 'english', "tts": english}, {"lang": 'quechua', "tts": quechua}, {"lang": 'spanish', "tts": spanish}]
|
26 |
+
|
27 |
+
# *************************** METHODS ***************************
# TEXT TO TEXT: translate plain text into the selected target language.
def text_to_text(text, lang, logs_file):
    """Translate *text* into *lang* with Google Translator.

    Progress is echoed to stdout and appended to *logs_file*.
    Raises ValueError when *lang* is not one of the configured languages.
    """
    stamp = time.ctime().split()[3]
    print(stamp + " - Traduciendo el texto a texto en el idioma seleccionado...")
    logs_file.write(stamp + " - Traduciendo el texto a texto en el idioma seleccionado...\n")
    # Guard: the requested language must be one of the configured entries.
    if not any(entry["lang"] == lang for entry in langs):
        raise ValueError(f"Lenguaje '{lang}' no disponible.")
    # deep_translator accepts the lowercase language name directly as target.
    translator = GoogleTranslator(source='auto', target=lang)
    text_translated = translator.translate(text)
    stamp = time.ctime().split()[3]
    print(stamp + " - Texto traducido: ", text_translated)
    logs_file.write(stamp + " - Texto traducido: " + text_translated + "\n")
    return text_translated
|
41 |
+
|
42 |
+
# TEXT TO AUDIO: translate text and synthesize it as speech.
def text_to_audio(text, lang, logs_file):
    """Translate *text* into *lang* and synthesize the translation to audio.

    Returns a tuple ``(wav_path, text_translated)`` where *wav_path* is the
    generated audio file path. Raises ValueError when *lang* is unsupported.
    """
    tiempo = time.ctime().split()
    print(tiempo[3] + " - Convirtiendo el texto extraido a audio en el idioma seleccionado...")
    logs_file.write(tiempo[3] + " - Convirtiendo el texto extraido a audio en el idioma seleccionado...\n")
    # Find the TTS model configured for the requested language.
    selected_lang = next((lang_item for lang_item in langs if lang_item["lang"] == lang), None)
    if selected_lang is None:
        raise ValueError(f"Lenguaje '{lang}' no disponible.")
    selected_tts = selected_lang["tts"]
    # Translate first, then synthesize the translated text.
    text_translated = text_to_text(text, lang, logs_file)
    wav_path = "audio_output." + output_audio_format
    # Generate the speech audio and write it to wav_path.
    # (Removed leftover debug print of wav_path.)
    selected_tts.synthesis(text_translated, wav_path=wav_path)
    tiempo = time.ctime().split()
    print(tiempo[3] + " - Audio traducido generado: ", wav_path)
    logs_file.write(tiempo[3] + " - Audio traducido generado: " + wav_path + "\n")
    return wav_path, text_translated
|
59 |
+
|
60 |
+
# AUDIO TO TEXT: transcribe audio using Google's speech recognition API.
def audio_to_text(audio_file, logs_file):
    """Transcribe *audio_file* to text via Google speech recognition.

    Uses the module-level ``input_language`` locale. Returns the recognized
    text, or None when the audio cannot be transcribed or the service is
    unreachable. Progress and errors are echoed to stdout and *logs_file*.
    """
    stamp = time.ctime().split()[3]
    print(stamp + " - Convirtiendo el audio a texto...")
    logs_file.write(stamp + " - Convirtiendo el audio a texto...\n")
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        recorded = recognizer.record(source)
    try:
        text = recognizer.recognize_google(recorded, language=input_language)
    except sr.UnknownValueError:
        # The API could not make sense of the audio.
        print("Google Speech Recognition no pudo transcribir el audio.")
        logs_file.write("Google Speech Recognition no pudo transcribir el audio.\n")
        return None
    except sr.RequestError:
        # Network / service failure.
        print("Reconocimiento de audio no disponible.")
        logs_file.write("Reconocimiento de audio no disponible.\n")
        return None
    stamp = time.ctime().split()[3]
    print(stamp + " - Reconocimiento de texto obtenido del audio: ", text)
    logs_file.write(stamp + " - Reconocimiento de texto obtenido del audio: " + text + "\n")
    return text
|
82 |
+
|
83 |
+
# VIDEO TO AUDIO: extract the audio track from a video file.
def video_to_audio(video_file, output_audio_ext, logs_file):
    """Extract the audio of *video_file* as a mono 16 kHz file via ffmpeg.

    Returns the path of the extracted audio file
    (``<video basename>_audio.<output_audio_ext>``).
    """
    stamp = time.ctime().split()[3]
    print(stamp + " - Extrayendo el audio del video...")
    logs_file.write(stamp + " - Extrayendo el audio del video...\n")
    base_name, _ = os.path.splitext(video_file)
    audio_video = base_name + "_audio." + output_audio_ext
    # -ar 16000 -ac 1: mono 16 kHz, the rate speech recognition expects.
    subprocess.call(
        ["ffmpeg", "-y", "-i", video_file, "-ar", "16000", "-ac", "1", audio_video],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
    stamp = time.ctime().split()[3]
    print(stamp + " - Audio extraido: ", audio_video)
    logs_file.write(stamp + " - Audio extraido: " + audio_video + "\n")
    return audio_video
|
97 |
+
|
98 |
+
# VIDEO TO VIDEO: dub a video with the translated audio track.
def video_to_video(video_file, audio_file_traslated, output_video_ext, logs_file):
    """Replace the audio of *video_file* with *audio_file_traslated*.

    Strips the original audio into an intermediate muted file, then muxes
    it with the translated audio. Returns the path of the dubbed video
    (``<video basename>_traslated.<output_video_ext>``).
    """
    stamp = time.ctime().split()[3]
    print(stamp + " - Procesando el video para obtenerlo sin audio...")
    logs_file.write(stamp + " - Procesando el video para obtenerlo sin audio...\n")
    base_name, _ = os.path.splitext(video_file)
    video_mute = base_name + "_muted." + output_video_ext
    # -an drops the audio stream, leaving video only.
    subprocess.call(
        ["ffmpeg", "-y", "-i", video_file, "-an", video_mute],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )

    stamp = time.ctime().split()[3]
    print(stamp + " - Doblando el video con el audio traducido...")
    logs_file.write(stamp + " - Doblando el video con el audio traducido...\n")
    video_traslated = base_name + "_traslated." + output_video_ext
    # Mux muted video + translated audio; -shortest trims to the shorter stream.
    subprocess.call(
        ["ffmpeg", "-y", "-i", video_mute, "-i", audio_file_traslated, "-shortest", video_traslated],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
    stamp = time.ctime().split()[3]
    print(stamp + " - Video traducido: ", video_traslated)
    logs_file.write(stamp + " - Video traducido: " + video_traslated + "\n")
    return video_traslated
|
120 |
+
|
121 |
+
# VIDEO TO VIDEO SUBTITLED: burn translated subtitles into a video.
def video_to_video_subtitled(video_file, text_traslated, output_video_ext, logs_file):
    """Render *text_traslated* as burned-in subtitles on *video_file*.

    The translated text is split into word groups sized so the whole text
    spreads across the video duration (one subtitle line per second), a
    well-formed ``video_subtitles.srt`` file is written, and ffmpeg renders
    it onto the video. Returns the subtitled video path
    (``<video basename>_subtitled.<output_video_ext>``).

    Fixes over the previous version: SRT timestamp lines no longer carry a
    stray trailing ``--> ``; the redundant outer timing loop is gone (the
    inner loop already consumed every word on its first pass); the .srt
    file handle is closed via a context manager; a zero probed duration no
    longer raises ZeroDivisionError.
    """
    tiempo = time.ctime().split()
    print(tiempo[3] + " - Procesando el video subtitulado...")
    logs_file.write(tiempo[3] + " - Procesando el video subtitulado...\n")
    subtitles = text_traslated.split()
    filename, ext = os.path.splitext(video_file)
    length_video = get_length_video(video_file)
    # Words per one-second subtitle line; clamp to >= 1 so the slicing step
    # below is valid even for very short texts or a zero-length probe.
    if length_video > 0:
        length_line_subtitle = max(1, math.ceil(len(subtitles) / length_video))
    else:
        length_line_subtitle = max(1, len(subtitles))
    # Group the words into subtitle lines of length_line_subtitle words each.
    subtitles_line = [
        ' '.join(subtitles[j:j + length_line_subtitle])
        for j in range(0, len(subtitles), length_line_subtitle)
    ]

    # Write a well-formed SRT file: index, "start --> end" timestamp line,
    # then the (bold) subtitle text, separated by a blank line.
    with open("video_subtitles.srt", "w+") as subtitles_file:
        for i, line in enumerate(subtitles_line):
            start = time.strftime('%H:%M:%S', time.gmtime(i))
            end = time.strftime('%H:%M:%S', time.gmtime(i + 1))
            subtitles_file.write(
                f"{i + 1}\n{start},001 --> {end},000\n<b>{line}</b>\n\n"
            )

    # Burn the subtitles onto the video, keeping the original audio.
    subprocess.call(["ffmpeg", "-y", "-copyts", "-i", video_file, "-vf", "subtitles=video_subtitles.srt:force_style='Fontname=Futura,Fontsize=20,MarginV=50,Shadow=1'", f"{filename+'_subtitled'}.{output_video_ext}"],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)

    video_subtitled = filename + "_subtitled." + output_video_ext
    tiempo = time.ctime().split()
    print(tiempo[3] + " - Video subtitulado: ", video_subtitled)
    logs_file.write(tiempo[3] + " - Video subtitulado: " + video_subtitled + "\n")
    return video_subtitled
|
165 |
+
|
166 |
+
def get_length_video(filename):
    """Return the duration of *filename* in seconds, probed with ffprobe.

    Raises ValueError if ffprobe produces no parseable duration (e.g. the
    file does not exist or is not a media file).
    """
    # Discard stderr instead of merging it into stdout: with the previous
    # stderr=subprocess.STDOUT, any ffprobe warning was mixed into the
    # captured output and broke the float() parse below.
    result = subprocess.run(["ffprobe", "-v", "error", "-show_entries",
                             "format=duration", "-of",
                             "default=noprint_wrappers=1:nokey=1", filename],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.DEVNULL)
    return float(result.stdout)
|
173 |
+
|
174 |
+
# *************************** MAIN ***************************
|
175 |
+
# ************************** ROUTER **************************
|
176 |
+
# ROUTER: Función para transcribir video, audio y texto al lenguaje seleccionado
|
177 |
+
def multimedia_to_multimedia_app(lang_input, video_file_upload, audio_file_upload, video_file_webcam, audio_file_microphone, text_input):
|
178 |
+
tiempo = time.ctime().split()
|
179 |
+
logs_file = open("logs.txt","w+")
|
180 |
+
logs_file.write("LOGS TRADUCTOR MULTILENGUAJE\n")
|
181 |
+
if video_file_webcam and lang_input:
|
182 |
+
print("PROCESANDO GRABACIÓN VIDEO DE LA WEBCAM")
|
183 |
+
logs_file.write("PROCESANDO GRABACIÓN VIDEO DE LA WEBCAM\n")
|
184 |
+
print(tiempo[3] + " - Traduciendo el video grabado: " + video_file_webcam + " al idioma " + lang_input)
|
185 |
+
logs_file.write(tiempo[3] + " - Traduciendo el video grabado: " + video_file_webcam + " al idioma " + lang_input + "\n")
|
186 |
+
text_transcribed = convert_video_to_text_app(lang_input, video_file_webcam, logs_file)
|
187 |
+
audio_traslated, text_translated = text_to_audio(text_transcribed, lang_input, logs_file)
|
188 |
+
#video_subtitled = convert_video_to_video_subtitled_app(video_file_webcam, text_translated, logs_file)
|
189 |
+
#video_traslated = convert_video_to_video_app(video_file_webcam, audio_traslated, logs_file)
|
190 |
+
return_video_subtitled = [None]*1
|
191 |
+
return_video_traslated = [None]*1
|
192 |
+
hilo_video_subtitled = Thread(target=convert_video_to_video_subtitled_app, args=(video_file_webcam, text_translated,logs_file,return_video_subtitled,))
|
193 |
+
hilo_video_traslated = Thread(target=convert_video_to_video_app, args=(video_file_webcam, audio_traslated,logs_file,return_video_traslated,))
|
194 |
+
hilo_video_subtitled.start()
|
195 |
+
hilo_video_traslated.start()
|
196 |
+
hilo_video_subtitled.join()
|
197 |
+
hilo_video_traslated.join()
|
198 |
+
video_subtitled = return_video_subtitled[0]
|
199 |
+
video_traslated = return_video_traslated[0]
|
200 |
+
print("FIN PROCESO GRABACIÓN VIDEO DE LA WEBCAM")
|
201 |
+
logs_file.write("FIN PROCESO GRABACIÓN VIDEO DE LA WEBCAM\n")
|
202 |
+
logs_file.close()
|
203 |
+
return text_transcribed, text_translated, audio_traslated, video_subtitled, video_traslated
|
204 |
+
if audio_file_microphone and lang_input:
|
205 |
+
print("PROCESANDO GRABACIÓN AUDIO DEL MICRÓFONO")
|
206 |
+
logs_file.write("PROCESANDO GRABACIÓN AUDIO DEL MICRÓFONO\n")
|
207 |
+
print(tiempo[3] + " - Traduciendo el audio grabado " + audio_file_microphone + " al idioma " + lang_input)
|
208 |
+
logs_file.write(tiempo[3] + " - Traduciendo el audio grabado " + audio_file_microphone + " al idioma " + lang_input + "\n")
|
209 |
+
text_translated, text_transcribed, audio_traslated = convert_audio_to_audio_app(lang_input,audio_file_microphone,logs_file)
|
210 |
+
video_subtitled = None
|
211 |
+
video_traslated = None
|
212 |
+
print("FIN PROCESO GRABACIÓN AUDIO DEL MICRÓFONO")
|
213 |
+
logs_file.write("FIN PROCESO GRABACIÓN AUDIO DEL MICRÓFONO\n")
|
214 |
+
logs_file.close()
|
215 |
+
return text_transcribed, text_translated, audio_traslated, video_subtitled, video_traslated
|
216 |
+
if video_file_upload and lang_input:
|
217 |
+
print("PROCESANDO ARCHIVO DE VIDEO")
|
218 |
+
logs_file.write("PROCESANDO ARCHIVO DE VIDEO\n")
|
219 |
+
print(tiempo[3] + " - Traduciendo el video ingresado " + video_file_upload + " al idioma " + lang_input)
|
220 |
+
logs_file.write(tiempo[3] + " - Traduciendo el video ingresado " + video_file_upload + " al idioma " + lang_input + "\n")
|
221 |
+
text_transcribed = convert_video_to_text_app(lang_input,video_file_upload,logs_file)
|
222 |
+
audio_traslated, text_translated = text_to_audio(text_transcribed, lang_input,logs_file)
|
223 |
+
#video_subtitled = convert_video_to_video_subtitled_app(video_file_upload, text_translated,logs_file)
|
224 |
+
#video_traslated = convert_video_to_video_app(video_file_upload, audio_traslated,logs_file)
|
225 |
+
return_video_subtitled = [None]*1
|
226 |
+
return_video_traslated = [None]*1
|
227 |
+
hilo_video_subtitled = Thread(target=convert_video_to_video_subtitled_app, args=(video_file_upload, text_translated,logs_file,return_video_subtitled,))
|
228 |
+
hilo_video_traslated = Thread(target=convert_video_to_video_app, args=(video_file_upload, audio_traslated,logs_file,return_video_traslated,))
|
229 |
+
hilo_video_subtitled.start()
|
230 |
+
hilo_video_traslated.start()
|
231 |
+
hilo_video_subtitled.join()
|
232 |
+
hilo_video_traslated.join()
|
233 |
+
video_subtitled = return_video_subtitled[0]
|
234 |
+
video_traslated = return_video_traslated[0]
|
235 |
+
|
236 |
+
print("FIN PROCESO ARCHIVO DE VIDEO")
|
237 |
+
logs_file.write("FIN PROCESO ARCHIVO DE VIDEO\n")
|
238 |
+
logs_file.close()
|
239 |
+
return text_transcribed, text_translated, audio_traslated, video_subtitled, video_traslated
|
240 |
+
if audio_file_upload and lang_input:
|
241 |
+
print("PROCESANDO ARCHIVO DE AUDIO")
|
242 |
+
logs_file.write("PROCESANDO ARCHIVO DE AUDIO\n")
|
243 |
+
print(tiempo[3] + " - Traduciendo el audio ingresado " + audio_file_upload + " al idioma " + lang_input)
|
244 |
+
logs_file.write(tiempo[3] + " - Traduciendo el audio ingresado " + audio_file_upload + " al idioma " + lang_input + "\n")
|
245 |
+
text_translated, text_transcribed, audio_traslated = convert_audio_to_audio_app(lang_input,audio_file_upload,logs_file)
|
246 |
+
video_subtitled = None
|
247 |
+
video_traslated = None
|
248 |
+
print("FIN PROCESO ARCHIVO DE AUDIO")
|
249 |
+
logs_file.write("FIN PROCESO ARCHIVO DE AUDIO\n")
|
250 |
+
logs_file.close()
|
251 |
+
return text_transcribed, text_translated, audio_traslated, video_subtitled, video_traslated
|
252 |
+
if text_input and lang_input:
|
253 |
+
print("PROCESANDO TEXTO INGRESADO")
|
254 |
+
logs_file.write("PROCESANDO TEXTO INGRESADO\n")
|
255 |
+
print(tiempo[3] + " - Traduciendo el texto ingresado " + text_input + " al idioma " + lang_input)
|
256 |
+
logs_file.write(tiempo[3] + " - Traduciendo el texto ingresado " + text_input + " al idioma " + lang_input + "\n")
|
257 |
+
audio_traslated, text_translated = text_to_audio(text_input, lang_input, logs_file)
|
258 |
+
video_subtitled = None
|
259 |
+
video_traslated = None
|
260 |
+
print("FIN PROCESO TEXTO INGRESADO")
|
261 |
+
logs_file.write("FIN PROCESO TEXTO INGRESADO\n")
|
262 |
+
logs_file.close()
|
263 |
+
return text_input, text_translated, audio_traslated, video_subtitled, video_traslated
|
264 |
+
if not lang_input:
|
265 |
+
print("Error - Lenguaje no ingresado")
|
266 |
+
raise gr.Error("Debes ingresar el idioma a traducir") # Muestra la alerta si no se ingresa un idioma a traducir
|
267 |
+
|
268 |
+
# *************************** SERVICIOS ***************************
|
269 |
+
# t2t: Traducir el texto a texto en el idioma deseado
|
270 |
+
def convert_text_to_text_app(lang_input, text_to_translate, logs_file):
|
271 |
+
if text_to_translate:
|
272 |
+
print("Traduciendo texto " + text_to_translate + " al idioma " + lang_input)
|
273 |
+
logs_file.write("Traduciendo texto " + text_to_translate + " al idioma " + lang_input + "\n")
|
274 |
+
text_translated = text_to_text(text_to_translate, lang_input, logs_file)
|
275 |
+
return text_translated
|
276 |
+
|
277 |
+
# a2t: Transcribir el audio a texto
|
278 |
+
def convert_audio_to_text_app(lang_input, audio_file, logs_file):
|
279 |
+
if audio_file:
|
280 |
+
print("Convirtiendo audio " + audio_file + " al idioma " + lang_input)
|
281 |
+
logs_file.write("Convirtiendo audio " + audio_file + " al idioma " + lang_input + "\n")
|
282 |
+
text_translated = audio_to_text(audio_file, logs_file)
|
283 |
+
return text_translated
|
284 |
+
|
285 |
+
# a2a: Transcribir el audio a texto y de texto al audio traducido
|
286 |
+
def convert_audio_to_audio_app(lang_input, audio_file, logs_file):
|
287 |
+
if audio_file:
|
288 |
+
print("Traduciendo audio " + audio_file + " al idioma deseado...")
|
289 |
+
logs_file.write("Traduciendo audio " + audio_file + " al idioma deseado...\n")
|
290 |
+
text_transcribed = audio_to_text(audio_file, logs_file)
|
291 |
+
audio_traslated, text_translated = text_to_audio(text_transcribed, lang_input, logs_file)
|
292 |
+
return text_translated, text_transcribed, audio_traslated
|
293 |
+
|
294 |
+
# v2t: Convertir video a audio usando 'ffmpeg' con módulo 'subprocess'
|
295 |
+
def convert_video_to_text_app(lang_input,video_file, logs_file, output_audio_ext="wav"):
|
296 |
+
print("Procesando video " + video_file + " para convertirlo a texto...")
|
297 |
+
logs_file.write("Procesando video " + video_file + " para convertirlo a texto...\n")
|
298 |
+
audio_video = video_to_audio(video_file, output_audio_ext, logs_file)
|
299 |
+
text_translated = convert_audio_to_text_app(lang_input,audio_video, logs_file)
|
300 |
+
return text_translated
|
301 |
+
|
302 |
+
# v2v: Convertir video a video
|
303 |
+
def convert_video_to_video_app(video_file, audio_file_traslated, logs_file, return_video_traslated, output_video_ext=output_video_format):
|
304 |
+
print("Procesando video " + video_file + " para traducirlo...")
|
305 |
+
logs_file.write("Procesando video " + video_file + " para traducirlo...\n")
|
306 |
+
video_traslated = video_to_video(video_file, audio_file_traslated, output_video_ext,logs_file)
|
307 |
+
return_video_traslated[0] = video_traslated
|
308 |
+
#return video_traslated
|
309 |
+
|
310 |
+
# v2vs: Convertir video a video subtitulado
|
311 |
+
def convert_video_to_video_subtitled_app(video_file, text_translated, logs_file, return_video_subtitled, output_video_ext=output_video_format):
|
312 |
+
print("Procesando video " + video_file + " para subtitularlo...")
|
313 |
+
logs_file.write("Procesando video " + video_file + " para subtitularlo...\n")
|
314 |
+
video_subtitled = video_to_video_subtitled(video_file, text_translated, output_video_ext, logs_file)
|
315 |
+
return_video_subtitled[0] = video_subtitled
|
316 |
+
#return video_subtitled
|
317 |
+
|
318 |
+
# *************************** INTERFAZ ***************************
|
319 |
+
# Entradas y salidas en la interfaz Gradio
|
320 |
+
lang_input = gr.components.Dropdown(choices=[lang["lang"] for lang in langs], label="Selecciona el idioma al cual deseas traducir:*")
|
321 |
+
|
322 |
+
#video_input_file = gr.Video(label= "Noticias Caracol", value="D:/Noticias/noticias_caracol_long.mp4")
|
323 |
+
video_input_file = gr.Video()
|
324 |
+
video_input_file = gr.Video(label= "Noticias Caracol", source="upload")
|
325 |
+
video_input_webcam = gr.Video(label= "Noticias Caracol en vivo", source="webcam", include_audio=1)
|
326 |
+
#audio_input_file = gr.Audio(label="Blue Radio", value="D:/Noticias/caracol_radio.mp3")
|
327 |
+
audio_input_file = gr.Audio(label="Blue Radio", source="upload", type="filepath")
|
328 |
+
audio_input_microphone = gr.Audio(label="Blue Radio en vivo", source="microphone", type="filepath")
|
329 |
+
text_input = gr.components.Textbox(label="Noticia a traducir:")
|
330 |
+
output_text_transcribed = gr.components.Textbox(label="Transcripción")
|
331 |
+
output_text_traslated = gr.components.Textbox(label="Traducción")
|
332 |
+
output_audio = gr.components.Audio(label="Audio traducido", format=output_audio_format)
|
333 |
+
output_video_subtitled = gr.components.Video(label="Noticia subtitulada", format=output_video_format)
|
334 |
+
output_video_traslated = gr.components.Video(label="Noticia traducida", format=output_video_format)
|
335 |
+
|
336 |
+
"""""""""
|
337 |
+
embed_html = '<iframe width="560" height="315" src="https://www.youtube.com/embed/EngW7tLk6R8" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'
|
338 |
+
with gr.Blocks() as interface:
|
339 |
+
gr.HTML(embed_html)
|
340 |
+
"""""""""
|
341 |
+
|
342 |
+
# Crea la interfaz Gradio para multimedia_to_multimedia_app
|
343 |
+
interface = gr.Interface(
|
344 |
+
fn=multimedia_to_multimedia_app,
|
345 |
+
inputs=[lang_input, video_input_file, audio_input_file, video_input_webcam, audio_input_microphone, text_input],
|
346 |
+
outputs=[output_text_transcribed, output_text_traslated, output_audio, output_video_subtitled, output_video_traslated],
|
347 |
+
title="TRADUCTOR MULTILENGUA DE NOTICIAS | AYTÉ - CARACOL",
|
348 |
+
description="Ingresa la noticia que deseas traducir:",
|
349 |
+
#theme = gr.themes.Soft()
|
350 |
+
theme=gr.themes.Default(primary_hue="blue")
|
351 |
+
)
|
352 |
+
#interface.launch() # Lanza la interfaz
|
353 |
+
#interface.launch(share=True, auth=("caracol", "caracol"), server_name=("127.0.0.1"), server_port=(7860), favicon_path=())
|
354 |
+
interface.launch(share=True, auth=("caracol", "caracol"), server_name=("127.0.0.1"), server_port=(7860))
|
assets/images/favico.ico
ADDED
assets/images/icono.png
ADDED
assets/images/logo.png
ADDED
assets/styles/css.css
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
h1 {
|
2 |
+
color: orange;
|
3 |
+
}
|
audio_output.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:46d0e34a258a63efae8713b1b054d0875819a225531f988aac50e3751c6a394a
|
3 |
+
size 1763386
|
data/eng/D_100000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:04b1d7a2726b3cb27c18604ace828556d9c17c09f65eb041f690a89c99d7aea4
|
3 |
+
size 561110135
|
data/eng/G_100000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d276cee0f8291de23c8ed4f4a2ed15e3e4cff7b2d6af43660cd6b5e6e1149110
|
3 |
+
size 436618116
|
data/eng/config.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 1000,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 20000,
|
7 |
+
"learning_rate": 0.0002,
|
8 |
+
"betas": [
|
9 |
+
0.8,
|
10 |
+
0.99
|
11 |
+
],
|
12 |
+
"eps": 1e-09,
|
13 |
+
"batch_size": 64,
|
14 |
+
"fp16_run": true,
|
15 |
+
"lr_decay": 0.999875,
|
16 |
+
"segment_size": 8192,
|
17 |
+
"init_lr_ratio": 1,
|
18 |
+
"warmup_epochs": 0,
|
19 |
+
"c_mel": 45,
|
20 |
+
"c_kl": 1.0
|
21 |
+
},
|
22 |
+
"data": {
|
23 |
+
"training_files": "train.ltr",
|
24 |
+
"validation_files": "dev.ltr",
|
25 |
+
"text_cleaners": [
|
26 |
+
"transliteration_cleaners"
|
27 |
+
],
|
28 |
+
"max_wav_value": 32768.0,
|
29 |
+
"sampling_rate": 16000,
|
30 |
+
"filter_length": 1024,
|
31 |
+
"hop_length": 256,
|
32 |
+
"win_length": 1024,
|
33 |
+
"n_mel_channels": 80,
|
34 |
+
"mel_fmin": 0.0,
|
35 |
+
"mel_fmax": null,
|
36 |
+
"add_blank": true,
|
37 |
+
"n_speakers": 0,
|
38 |
+
"cleaned_text": true
|
39 |
+
},
|
40 |
+
"model": {
|
41 |
+
"inter_channels": 192,
|
42 |
+
"hidden_channels": 192,
|
43 |
+
"filter_channels": 768,
|
44 |
+
"n_heads": 2,
|
45 |
+
"n_layers": 6,
|
46 |
+
"kernel_size": 3,
|
47 |
+
"p_dropout": 0.1,
|
48 |
+
"resblock": "1",
|
49 |
+
"resblock_kernel_sizes": [
|
50 |
+
3,
|
51 |
+
7,
|
52 |
+
11
|
53 |
+
],
|
54 |
+
"resblock_dilation_sizes": [
|
55 |
+
[
|
56 |
+
1,
|
57 |
+
3,
|
58 |
+
5
|
59 |
+
],
|
60 |
+
[
|
61 |
+
1,
|
62 |
+
3,
|
63 |
+
5
|
64 |
+
],
|
65 |
+
[
|
66 |
+
1,
|
67 |
+
3,
|
68 |
+
5
|
69 |
+
]
|
70 |
+
],
|
71 |
+
"upsample_rates": [
|
72 |
+
8,
|
73 |
+
8,
|
74 |
+
2,
|
75 |
+
2
|
76 |
+
],
|
77 |
+
"upsample_initial_channel": 512,
|
78 |
+
"upsample_kernel_sizes": [
|
79 |
+
16,
|
80 |
+
16,
|
81 |
+
4,
|
82 |
+
4
|
83 |
+
],
|
84 |
+
"n_layers_q": 3,
|
85 |
+
"use_spectral_norm": false
|
86 |
+
}
|
87 |
+
}
|
data/eng/vocab.txt
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
k
|
2 |
+
'
|
3 |
+
z
|
4 |
+
y
|
5 |
+
u
|
6 |
+
d
|
7 |
+
h
|
8 |
+
e
|
9 |
+
s
|
10 |
+
w
|
11 |
+
–
|
12 |
+
3
|
13 |
+
c
|
14 |
+
p
|
15 |
+
-
|
16 |
+
1
|
17 |
+
j
|
18 |
+
m
|
19 |
+
i
|
20 |
+
|
21 |
+
f
|
22 |
+
l
|
23 |
+
o
|
24 |
+
0
|
25 |
+
b
|
26 |
+
r
|
27 |
+
a
|
28 |
+
4
|
29 |
+
2
|
30 |
+
n
|
31 |
+
_
|
32 |
+
x
|
33 |
+
v
|
34 |
+
t
|
35 |
+
q
|
36 |
+
5
|
37 |
+
6
|
38 |
+
g
|
data/gum/D_100000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0755e583b0b39fe2cc3cf7dfd5c4c9d184de3c83bf562281c7fa23a272bcf9d2
|
3 |
+
size 561109839
|
data/gum/G_100000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ea406973d9699d994463477d4adfeada83625459e1fa606b7cc7e0593f4c31c2
|
3 |
+
size 436625202
|
data/gum/config.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 1000,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 20000,
|
7 |
+
"learning_rate": 0.0002,
|
8 |
+
"betas": [
|
9 |
+
0.8,
|
10 |
+
0.99
|
11 |
+
],
|
12 |
+
"eps": 1e-09,
|
13 |
+
"batch_size": 64,
|
14 |
+
"fp16_run": true,
|
15 |
+
"lr_decay": 0.999875,
|
16 |
+
"segment_size": 8192,
|
17 |
+
"init_lr_ratio": 1,
|
18 |
+
"warmup_epochs": 0,
|
19 |
+
"c_mel": 45,
|
20 |
+
"c_kl": 1.0
|
21 |
+
},
|
22 |
+
"data": {
|
23 |
+
"training_files": "train.ltr",
|
24 |
+
"validation_files": "dev.ltr",
|
25 |
+
"text_cleaners": [
|
26 |
+
"transliteration_cleaners"
|
27 |
+
],
|
28 |
+
"max_wav_value": 32768.0,
|
29 |
+
"sampling_rate": 16000,
|
30 |
+
"filter_length": 1024,
|
31 |
+
"hop_length": 256,
|
32 |
+
"win_length": 1024,
|
33 |
+
"n_mel_channels": 80,
|
34 |
+
"mel_fmin": 0.0,
|
35 |
+
"mel_fmax": null,
|
36 |
+
"add_blank": true,
|
37 |
+
"n_speakers": 0,
|
38 |
+
"cleaned_text": true
|
39 |
+
},
|
40 |
+
"model": {
|
41 |
+
"inter_channels": 192,
|
42 |
+
"hidden_channels": 192,
|
43 |
+
"filter_channels": 768,
|
44 |
+
"n_heads": 2,
|
45 |
+
"n_layers": 6,
|
46 |
+
"kernel_size": 3,
|
47 |
+
"p_dropout": 0.1,
|
48 |
+
"resblock": "1",
|
49 |
+
"resblock_kernel_sizes": [
|
50 |
+
3,
|
51 |
+
7,
|
52 |
+
11
|
53 |
+
],
|
54 |
+
"resblock_dilation_sizes": [
|
55 |
+
[
|
56 |
+
1,
|
57 |
+
3,
|
58 |
+
5
|
59 |
+
],
|
60 |
+
[
|
61 |
+
1,
|
62 |
+
3,
|
63 |
+
5
|
64 |
+
],
|
65 |
+
[
|
66 |
+
1,
|
67 |
+
3,
|
68 |
+
5
|
69 |
+
]
|
70 |
+
],
|
71 |
+
"upsample_rates": [
|
72 |
+
8,
|
73 |
+
8,
|
74 |
+
2,
|
75 |
+
2
|
76 |
+
],
|
77 |
+
"upsample_initial_channel": 512,
|
78 |
+
"upsample_kernel_sizes": [
|
79 |
+
16,
|
80 |
+
16,
|
81 |
+
4,
|
82 |
+
4
|
83 |
+
],
|
84 |
+
"n_layers_q": 3,
|
85 |
+
"use_spectral_norm": false
|
86 |
+
}
|
87 |
+
}
|
data/gum/vocab.txt
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
|
2 |
+
a
|
3 |
+
i
|
4 |
+
r
|
5 |
+
n
|
6 |
+
e
|
7 |
+
ø
|
8 |
+
u
|
9 |
+
g
|
10 |
+
m
|
11 |
+
b
|
12 |
+
t
|
13 |
+
s
|
14 |
+
k
|
15 |
+
h
|
16 |
+
c
|
17 |
+
l
|
18 |
+
w
|
19 |
+
p
|
20 |
+
y
|
21 |
+
d
|
22 |
+
o
|
23 |
+
ñ
|
24 |
+
ú
|
25 |
+
j
|
26 |
+
—
|
27 |
+
í
|
28 |
+
z
|
29 |
+
é
|
30 |
+
á
|
31 |
+
'
|
32 |
+
f
|
33 |
+
v
|
34 |
+
-
|
35 |
+
ó
|
36 |
+
q
|
37 |
+
0
|
38 |
+
x
|
39 |
+
1
|
40 |
+
2
|
41 |
+
4
|
42 |
+
3
|
43 |
+
|
data/quz/D_100000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22fd86a89725af83c7faf37d3824db296563871f8d357e07578f6183a992ffb0
|
3 |
+
size 561078748
|
data/quz/G_100000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1723774e696a2f11c58ce5e89a2ee2b47aad65955b0abae3d8865af28adf9364
|
3 |
+
size 436378676
|
data/quz/config.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 1000,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 20000,
|
7 |
+
"learning_rate": 0.0002,
|
8 |
+
"betas": [
|
9 |
+
0.8,
|
10 |
+
0.99
|
11 |
+
],
|
12 |
+
"eps": 1e-09,
|
13 |
+
"batch_size": 64,
|
14 |
+
"fp16_run": true,
|
15 |
+
"lr_decay": 0.999875,
|
16 |
+
"segment_size": 8192,
|
17 |
+
"init_lr_ratio": 1,
|
18 |
+
"warmup_epochs": 0,
|
19 |
+
"c_mel": 45,
|
20 |
+
"c_kl": 1.0
|
21 |
+
},
|
22 |
+
"data": {
|
23 |
+
"training_files": "train.ltr",
|
24 |
+
"validation_files": "dev.ltr",
|
25 |
+
"text_cleaners": [
|
26 |
+
"transliteration_cleaners"
|
27 |
+
],
|
28 |
+
"max_wav_value": 32768.0,
|
29 |
+
"sampling_rate": 16000,
|
30 |
+
"filter_length": 1024,
|
31 |
+
"hop_length": 256,
|
32 |
+
"win_length": 1024,
|
33 |
+
"n_mel_channels": 80,
|
34 |
+
"mel_fmin": 0.0,
|
35 |
+
"mel_fmax": null,
|
36 |
+
"add_blank": true,
|
37 |
+
"n_speakers": 0,
|
38 |
+
"cleaned_text": true
|
39 |
+
},
|
40 |
+
"model": {
|
41 |
+
"inter_channels": 192,
|
42 |
+
"hidden_channels": 192,
|
43 |
+
"filter_channels": 768,
|
44 |
+
"n_heads": 2,
|
45 |
+
"n_layers": 6,
|
46 |
+
"kernel_size": 3,
|
47 |
+
"p_dropout": 0.1,
|
48 |
+
"resblock": "1",
|
49 |
+
"resblock_kernel_sizes": [
|
50 |
+
3,
|
51 |
+
7,
|
52 |
+
11
|
53 |
+
],
|
54 |
+
"resblock_dilation_sizes": [
|
55 |
+
[
|
56 |
+
1,
|
57 |
+
3,
|
58 |
+
5
|
59 |
+
],
|
60 |
+
[
|
61 |
+
1,
|
62 |
+
3,
|
63 |
+
5
|
64 |
+
],
|
65 |
+
[
|
66 |
+
1,
|
67 |
+
3,
|
68 |
+
5
|
69 |
+
]
|
70 |
+
],
|
71 |
+
"upsample_rates": [
|
72 |
+
8,
|
73 |
+
8,
|
74 |
+
2,
|
75 |
+
2
|
76 |
+
],
|
77 |
+
"upsample_initial_channel": 512,
|
78 |
+
"upsample_kernel_sizes": [
|
79 |
+
16,
|
80 |
+
16,
|
81 |
+
4,
|
82 |
+
4
|
83 |
+
],
|
84 |
+
"n_layers_q": 3,
|
85 |
+
"use_spectral_norm": false
|
86 |
+
}
|
87 |
+
}
|
data/quz/vocab.txt
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
a
|
2 |
+
|
|
3 |
+
n
|
4 |
+
i
|
5 |
+
u
|
6 |
+
q
|
7 |
+
k
|
8 |
+
s
|
9 |
+
h
|
10 |
+
p
|
11 |
+
y
|
12 |
+
c
|
13 |
+
t
|
14 |
+
m
|
15 |
+
r
|
16 |
+
l
|
17 |
+
o
|
18 |
+
w
|
19 |
+
e
|
20 |
+
ñ
|
21 |
+
'
|
22 |
+
d
|
23 |
+
j
|
24 |
+
g
|
25 |
+
b
|
26 |
+
-
|
27 |
+
–
|
28 |
+
v
|
29 |
+
f
|
30 |
+
í
|
31 |
+
z
|
32 |
+
é
|
33 |
+
á
|
34 |
+
ó
|
35 |
+
ú
|
36 |
+
x
|
37 |
+
|
data/spa/D_100000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:07ab22829d36992fc47d7fde4d9e1313f2a8108d2442d489a0953b1910628d7a
|
3 |
+
size 561110151
|
data/spa/G_100000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8feb91089b706e231efb18d0038f5827f1a9d1e45c57c61fba7ebe2198a7c1e6
|
3 |
+
size 436635085
|
data/spa/config.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 1000,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 20000,
|
7 |
+
"learning_rate": 0.0002,
|
8 |
+
"betas": [
|
9 |
+
0.8,
|
10 |
+
0.99
|
11 |
+
],
|
12 |
+
"eps": 1e-09,
|
13 |
+
"batch_size": 64,
|
14 |
+
"fp16_run": true,
|
15 |
+
"lr_decay": 0.999875,
|
16 |
+
"segment_size": 8192,
|
17 |
+
"init_lr_ratio": 1,
|
18 |
+
"warmup_epochs": 0,
|
19 |
+
"c_mel": 45,
|
20 |
+
"c_kl": 1.0
|
21 |
+
},
|
22 |
+
"data": {
|
23 |
+
"training_files": "train.ltr",
|
24 |
+
"validation_files": "dev.ltr",
|
25 |
+
"text_cleaners": [
|
26 |
+
"transliteration_cleaners"
|
27 |
+
],
|
28 |
+
"max_wav_value": 32768.0,
|
29 |
+
"sampling_rate": 16000,
|
30 |
+
"filter_length": 1024,
|
31 |
+
"hop_length": 256,
|
32 |
+
"win_length": 1024,
|
33 |
+
"n_mel_channels": 80,
|
34 |
+
"mel_fmin": 0.0,
|
35 |
+
"mel_fmax": null,
|
36 |
+
"add_blank": true,
|
37 |
+
"n_speakers": 0,
|
38 |
+
"cleaned_text": true
|
39 |
+
},
|
40 |
+
"model": {
|
41 |
+
"inter_channels": 192,
|
42 |
+
"hidden_channels": 192,
|
43 |
+
"filter_channels": 768,
|
44 |
+
"n_heads": 2,
|
45 |
+
"n_layers": 6,
|
46 |
+
"kernel_size": 3,
|
47 |
+
"p_dropout": 0.1,
|
48 |
+
"resblock": "1",
|
49 |
+
"resblock_kernel_sizes": [
|
50 |
+
3,
|
51 |
+
7,
|
52 |
+
11
|
53 |
+
],
|
54 |
+
"resblock_dilation_sizes": [
|
55 |
+
[
|
56 |
+
1,
|
57 |
+
3,
|
58 |
+
5
|
59 |
+
],
|
60 |
+
[
|
61 |
+
1,
|
62 |
+
3,
|
63 |
+
5
|
64 |
+
],
|
65 |
+
[
|
66 |
+
1,
|
67 |
+
3,
|
68 |
+
5
|
69 |
+
]
|
70 |
+
],
|
71 |
+
"upsample_rates": [
|
72 |
+
8,
|
73 |
+
8,
|
74 |
+
2,
|
75 |
+
2
|
76 |
+
],
|
77 |
+
"upsample_initial_channel": 512,
|
78 |
+
"upsample_kernel_sizes": [
|
79 |
+
16,
|
80 |
+
16,
|
81 |
+
4,
|
82 |
+
4
|
83 |
+
],
|
84 |
+
"n_layers_q": 3,
|
85 |
+
"use_spectral_norm": false
|
86 |
+
}
|
87 |
+
}
|
data/spa/vocab.txt
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
7
|
2 |
+
a
|
3 |
+
v
|
4 |
+
c
|
5 |
+
—
|
6 |
+
0
|
7 |
+
5
|
8 |
+
ó
|
9 |
+
8
|
10 |
+
p
|
11 |
+
y
|
12 |
+
z
|
13 |
+
4
|
14 |
+
m
|
15 |
+
ü
|
16 |
+
k
|
17 |
+
s
|
18 |
+
á
|
19 |
+
q
|
20 |
+
h
|
21 |
+
n
|
22 |
+
é
|
23 |
+
_
|
24 |
+
9
|
25 |
+
1
|
26 |
+
f
|
27 |
+
t
|
28 |
+
|
29 |
+
x
|
30 |
+
d
|
31 |
+
í
|
32 |
+
b
|
33 |
+
3
|
34 |
+
j
|
35 |
+
g
|
36 |
+
l
|
37 |
+
2
|
38 |
+
i
|
39 |
+
u
|
40 |
+
e
|
41 |
+
ú
|
42 |
+
o
|
43 |
+
ñ
|
44 |
+
r
|
45 |
+
6
|
logs.txt
ADDED
File without changes
|
output.wav
ADDED
Binary file (359 kB). View file
|
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
SpeechRecognition
|
3 |
+
ttsmms
|
4 |
+
deep_translator
|
video.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5508d3536f55aa02a9cde9a8326799e72f0b148003d50936e735dd23c40cd3ba
|
3 |
+
size 2476504
|
video_subtitles.srt
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
1
|
2 |
+
00:00:00,001 --> 00:00:01,000 -->
|
3 |
+
<b>Can you</b>
|
4 |
+
2
|
5 |
+
00:00:01,001 --> 00:00:02,000 -->
|
6 |
+
<b>imagine leaving</b>
|
7 |
+
3
|
8 |
+
00:00:02,001 --> 00:00:03,000 -->
|
9 |
+
<b>home and</b>
|
10 |
+
4
|
11 |
+
00:00:03,001 --> 00:00:04,000 -->
|
12 |
+
<b>finding pieces</b>
|
13 |
+
5
|
14 |
+
00:00:04,001 --> 00:00:05,000 -->
|
15 |
+
<b>of a</b>
|
16 |
+
6
|
17 |
+
00:00:05,001 --> 00:00:06,000 -->
|
18 |
+
<b>plane? Well,</b>
|
19 |
+
7
|
20 |
+
00:00:06,001 --> 00:00:07,000 -->
|
21 |
+
<b>that happened</b>
|
22 |
+
8
|
23 |
+
00:00:07,001 --> 00:00:08,000 -->
|
24 |
+
<b>in a</b>
|
25 |
+
9
|
26 |
+
00:00:08,001 --> 00:00:09,000 -->
|
27 |
+
<b>Chicago neighborhood,</b>
|
28 |
+
10
|
29 |
+
00:00:09,001 --> 00:00:10,000 -->
|
30 |
+
<b>where an</b>
|
31 |
+
11
|
32 |
+
00:00:10,001 --> 00:00:11,000 -->
|
33 |
+
<b>emergency evacuation</b>
|
34 |
+
12
|
35 |
+
00:00:11,001 --> 00:00:12,000 -->
|
36 |
+
<b>slide was</b>
|
37 |
+
13
|
38 |
+
00:00:12,001 --> 00:00:13,000 -->
|
39 |
+
<b>found that</b>
|
40 |
+
14
|
41 |
+
00:00:13,001 --> 00:00:14,000 -->
|
42 |
+
<b>had detached</b>
|
43 |
+
15
|
44 |
+
00:00:14,001 --> 00:00:15,000 -->
|
45 |
+
<b>from an</b>
|
46 |
+
16
|
47 |
+
00:00:15,001 --> 00:00:16,000 -->
|
48 |
+
<b>aircraft that</b>
|
49 |
+
17
|
50 |
+
00:00:16,001 --> 00:00:17,000 -->
|
51 |
+
<b>hit you</b>
|
52 |
+
18
|
53 |
+
00:00:17,001 --> 00:00:18,000 -->
|
54 |
+
<b>at the</b>
|
55 |
+
19
|
56 |
+
00:00:18,001 --> 00:00:19,000 -->
|
57 |
+
<b>International Airport.</b>
|
58 |
+
20
|
59 |
+
00:00:19,001 --> 00:00:20,000 -->
|
60 |
+
<b>Authorities confirmed</b>
|
61 |
+
21
|
62 |
+
00:00:20,001 --> 00:00:21,000 -->
|
63 |
+
<b>that there</b>
|
64 |
+
22
|
65 |
+
00:00:21,001 --> 00:00:22,000 -->
|
66 |
+
<b>were no</b>
|
67 |
+
23
|
68 |
+
00:00:22,001 --> 00:00:23,000 -->
|
69 |
+
<b>injuries. The</b>
|
70 |
+
24
|
71 |
+
00:00:23,001 --> 00:00:24,000 -->
|
72 |
+
<b>large piece</b>
|
73 |
+
25
|
74 |
+
00:00:24,001 --> 00:00:25,000 -->
|
75 |
+
<b>of plastic</b>
|
76 |
+
26
|
77 |
+
00:00:25,001 --> 00:00:26,000 -->
|
78 |
+
<b>was removed</b>
|
79 |
+
27
|
80 |
+
00:00:26,001 --> 00:00:27,000 -->
|
81 |
+
<b>and later</b>
|
82 |
+
28
|
83 |
+
00:00:27,001 --> 00:00:28,000 -->
|
84 |
+
<b>it was</b>
|
85 |
+
29
|
86 |
+
00:00:28,001 --> 00:00:29,000 -->
|
87 |
+
<b>determined that</b>
|
88 |
+
30
|
89 |
+
00:00:29,001 --> 00:00:30,000 -->
|
90 |
+
<b>it belonged</b>
|
91 |
+
31
|
92 |
+
00:00:30,001 --> 00:00:31,000 -->
|
93 |
+
<b>to a</b>
|
94 |
+
32
|
95 |
+
00:00:31,001 --> 00:00:32,000 -->
|
96 |
+
<b>United Airlines</b>
|
97 |
+
33
|
98 |
+
00:00:32,001 --> 00:00:33,000 -->
|
99 |
+
<b>plane from</b>
|
100 |
+
34
|
101 |
+
00:00:33,001 --> 00:00:34,000 -->
|
102 |
+
<b>Switzerland that</b>
|
103 |
+
35
|
104 |
+
00:00:34,001 --> 00:00:35,000 -->
|
105 |
+
<b>landed safely</b>
|
106 |
+
36
|
107 |
+
00:00:35,001 --> 00:00:36,000 -->
|
108 |
+
<b>with 155</b>
|
109 |
+
37
|
110 |
+
00:00:36,001 --> 00:00:37,000 -->
|
111 |
+
<b>passengers and</b>
|
112 |
+
38
|
113 |
+
00:00:37,001 --> 00:00:38,000 -->
|
114 |
+
<b>10 crew</b>
|
115 |
+
39
|
116 |
+
00:00:38,001 --> 00:00:39,000 -->
|
117 |
+
<b>members.</b>
|